Unverified commit 90b31790, authored by zhaoyingli, committed by GitHub

[Cherry-Pick][AutoParallel] auto_parallel cherry-pick to release2.4 (#47145)

* [Auto Parallel] Make Engine class callable (#46416)

* [Auto Parallel] Improve the user-defined fetches and logging

* [Auto Parallel] Make Engine class callable

* [Auto Parallel] Update the data loading of tuner

* Print IPS in auto parallel Engine (#46554)

* [AutoParallel] fix dist_split (#46505)

* [AutoParallel] fix dist_split

* add unittest

* update cmakelist

* [AutoParallel] fix sharding (#46572)

* [AutoParallel] fix process_mesh (#46583)

* [AutoParallel] fix reshard when train with eval (#46605)

* [AutoParallel] fix reshard when train with eval

* fix mppp

* [AutoParallel] fix amp when predict (#46637)

* [Auto Parallel]Update comp cost and completion for gpt auto search (#46387)

* update comp cost and completion for gpt auto search

* add unittest

* [Auto Parallel] Fix bugs caused by the inconsistent outputs of Engine API (#46633)

* [Auto Parallel] Unify the logger and outputs of Engine API

* [Auto Parallel] Fix the bugs of to_static

* [Auto Parallel] Adjust the test_to_static.py

* [Auto Parallel] Improve the fine-grained APIs (#46552)

* [Auto Parallel] Support different dataloaders

* [Auto Parallel] Add num_shards config for dataset

* [Auto Parallel] Unify the logger and outputs of Engine API

* [Auto Parallel] Fix the bugs of to_static

* [Auto Parallel] Adjust the test_to_static.py

* [Auto Parallel] Add the prepare API and replace __call__ with run

* [Auto Parallel] Improve the private implementations of Engine

* [Auto Parallel] Set capacity of dataloader for opt tuning

* [Auto Parallel] [WIP] Change the fine-grained API

* [Auto Parallel] Improve APIs to support different user cases

* [Auto Parallel] Add removed config

* [Auto Parallel] Add imports

* [Auto Parallel] Fix bugs for to_static

* [Auto Parallel] Remove unnecessary imports

* bugfix (#46921)

* [Auto Parallel] Fix the bug for None labels (#46987)

* [AutoParallel] adapt for gpt-gen (#46771)

* for gpt-gen

* fix reshard

* adapt assign and shape op

* add dist_assign & unittest

* add conditional block unittest

* rename unittest

* [Auto Parallel] Fix the bug of completion (#47056)

* [Auto Parallel] Fix the bug for None labels

* [Auto Parallel] Fix the completion bug

* [AutoParallel] add callbacks (#47014)

* [AutoParallel] add callbacks

* fix unittest

* fix dist_context

* fix engine

* fix cmakelist

* fix unittest's returns

* fix cmakelist

* [Auto Parallel] Add cost interface (#47043)

* add cost interface

* update interface and add unittest

* update unittest

* update interface

* [Auto Parallel]Add parallel tuner (#46189)

* add parallel tuner

* add unittest

* fix unittest

* set timeout of unittest

* set unittest timeout

* fix auto_mode setting

* update unittest

* sync from develop and update unittest

* remove unused import

* update unittest

* update cmakelist

* add unittests
Co-authored-by: Yulong Ao <aoyulong@baidu.com>
Co-authored-by: Ruibiao Chen <chenruibiao@baidu.com>
Co-authored-by: caozhou <48191911+Caozhou1995@users.noreply.github.com>
Co-authored-by: JZ-LIANG <jianzhongliang10@gmail.com>
Parent 23f2a4ea
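As context for the diffs below, a minimal usage sketch of the reworked Engine workflow this cherry-pick targets (the model, optimizer, and datasets are illustrative assumptions; the API names follow the docstrings in engine.py below):

    import paddle
    from paddle.distributed.fleet import auto

    model = paddle.nn.Linear(784, 10)            # any paddle.nn.Layer
    loss = paddle.nn.CrossEntropyLoss()
    optimizer = paddle.optimizer.Adam(parameters=model.parameters())

    engine = auto.Engine(model, loss, optimizer)
    # fit() now drives the new callbacks and returns a History object
    history = engine.fit(train_dataset, epochs=2, batch_size=64)
    logs = engine.evaluate(valid_dataset, batch_size=64)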
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import paddle
from paddle.hapi.callbacks import ProgBarLogger, ModelCheckpoint, LRScheduler, CallbackList, Callback
from .interface import CollectionNames, get_collection
def config_callbacks(callbacks=None,
engine=None,
batch_size=None,
epochs=None,
steps=None,
log_freq=2,
verbose=2,
save_freq=1,
save_dir=None,
metrics=None,
acc_step=1,
mode='train'):
cbks = callbacks or []
cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
cbks = [ProgBarLoggerAuto(log_freq, verbose=verbose)] + cbks
if not any(isinstance(k, LRScheduler) for k in cbks):
cbks = [LRSchedulerAuto()] + cbks
if not any(isinstance(k, ModelCheckpoint) for k in cbks):
cbks = cbks + [ModelCheckpointAuto(save_freq, save_dir)]
if not any(isinstance(k, Profiler) for k in cbks) and verbose == 3:
cbks = cbks + [Profiler(timer_only=True)]
if not any(isinstance(k, History) for k in cbks):
cbks = cbks + [History()]
for i, k in enumerate(cbks):
if isinstance(k, ProgBarLogger):
cbks[i] = ProgBarLoggerAuto(k.log_freq, k.verbose)
if isinstance(k, LRScheduler):
cbks[i] = LRSchedulerAuto(k.by_step, k.by_epoch)
if isinstance(k, ModelCheckpoint):
cbks[i] = ModelCheckpointAuto(k.save_freq, k.save_dir)
cbk_list = CallbackList(cbks)
cbk_list.set_model(engine)
metrics = metrics or [] if mode != 'test' else []
params = {
'batch_size': batch_size,
'epochs': epochs,
'steps': steps,
'verbose': verbose,
'metrics': metrics,
'acc_step': acc_step,
}
cbk_list.set_params(params)
return cbk_list
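# NOTE (hedged sketch): a call such as
#   cbk_list = config_callbacks(callbacks=None, engine=engine, epochs=2,
#                               steps=100, save_dir='./ckpt')
# assembles [LRSchedulerAuto, ProgBarLoggerAuto, ModelCheckpointAuto, History]
# into a CallbackList whose model is the engine; verbose=3 would additionally
# append a timer-only Profiler.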
class ProgBarLoggerAuto(ProgBarLogger):
def __init__(self, log_freq=1, verbose=2):
super(ProgBarLoggerAuto, self).__init__(log_freq, verbose)
def _is_print(self):
return True
def _updates(self, logs, mode):
values = []
metrics = getattr(self, '%s_metrics' % (mode))
progbar = getattr(self, '%s_progbar' % (mode))
steps = getattr(self, '%s_step' % (mode))
for k in metrics:
if k in logs:
values.append((k, logs[k]))
if 'lr' in logs:
values.append(('lr', logs['lr']))
fetches_logs = logs.get('fetches', {})
collect_logging = get_collection(CollectionNames.LOGGING)
for name, var in collect_logging:
k = name or var.name
if k in fetches_logs:
values.append((k, fetches_logs[k]))
out_logs = logs.get('outputs', {})
for k in out_logs:
values.append((k, out_logs[k]))
if self.verbose == 3 and hasattr(self, '_%s_timer' % (mode)):
timer = getattr(self, '_%s_timer' % (mode))
cnt = timer['count'] if timer['count'] > 0 else 1.0
samples = timer['samples'] if timer['samples'] > 0 else 1.0
values.append(
('avg_reader_cost', "%.5f sec" % (timer['data_time'] / cnt)))
values.append(
('avg_batch_cost', "%.5f sec" % (timer['batch_time'] / cnt)))
values.append(
('ips', "%.5f samples/sec" %
(samples / (timer['data_time'] + timer['batch_time']))))
timer['count'] = 0
timer['samples'] = 0
timer['data_time'] = 0.
timer['batch_time'] = 0.
progbar.update(steps, values)
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
self.eval_step += 1
samples = self.params['batch_size']
self.evaled_samples += samples
self._eval_timer['batch_time'] += (
time.time() - self._eval_timer['batch_data_end_time'])
self._eval_timer['count'] += 1
samples = self.params['batch_size']
self._eval_timer['samples'] += samples
if self._is_print() and self.eval_step % self.log_freq == 0:
if self.eval_steps is None or self.eval_step < self.eval_steps:
self._updates(logs, 'eval')
self._eval_timer['batch_start_time'] = time.time()
class LRSchedulerAuto(LRScheduler):
def __init__(self, by_step=True, by_epoch=False):
super(LRSchedulerAuto, self).__init__(by_step, by_epoch)
def on_epoch_begin(self, epoch=None, logs=None):
self.acc_step = self.params["acc_step"]
self.epoch = epoch
self.train_step = 0
def on_train_batch_end(self, step, logs=None):
self.train_step += 1
if self.by_step and self.train_step % self.acc_step == 0:
if self.model._optimizer and \
hasattr(self.model._optimizer, '_learning_rate') and \
isinstance(self.model._optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
self.model._optimizer._learning_rate.step()
class History(Callback):
def __init__(self):
self.history = {}
def on_train_begin(self, logs=None):
self.epoch = []
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
self.epoch.append(epoch)
for k, v in logs.items():
self.history.setdefault(k, []).append(v)
self.model.history = self
class Profiler(Callback):
def __init__(self, *args, **kwargs):
self.prof = paddle.profiler.Profiler(*args, **kwargs)
def on_epoch_begin(self, epoch=None, logs=None):
self.epoch = epoch
self.train_step = 0
self.batch_size = self.params["batch_size"]
self.steps = self.params['steps']
def on_train_begin(self, logs=None):
self.prof.start()
def on_train_batch_end(self, step, logs=None):
self.train_step += 1
self.prof.step(num_samples=self.batch_size)
print("step {}:{}".format(self.train_step,
self.prof.step_info(unit='samples')))
def on_train_end(self, logs=None):
self.prof.stop()
self.prof.summary()
class ModelCheckpointAuto(ModelCheckpoint):
def __init__(self, *args, **kwargs):
super(ModelCheckpointAuto, self).__init__(*args, **kwargs)
def _is_save(self):
return self.model and self.save_dir
def on_epoch_end(self, epoch, logs=None):
if self._is_save() and (self.epoch + 1) % self.save_freq == 0:
path = '{}/epoch{}'.format(self.save_dir, epoch)
print('save checkpoint at {}'.format(os.path.abspath(path)))
self.model.save(path)
def on_train_end(self, logs=None):
if self._is_save():
path = '{}/final'.format(self.save_dir)
print('save checkpoint at {}'.format(os.path.abspath(path)))
self.model.save(path)
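A hedged sketch of how these callbacks surface during training: per config_callbacks above, verbose=3 is the level that adds the timer-only Profiler and makes ProgBarLoggerAuto report avg_reader_cost, avg_batch_cost and ips (the engine and dataset are again illustrative assumptions):

    history = engine.fit(train_dataset, batch_size=64, epochs=1, verbose=3)
    # per-step logs then also carry fields like 'ips: 1234.56789 samples/sec'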
......@@ -19,7 +19,7 @@ import time
from paddle.fluid import core
from paddle.fluid import framework
from .utils import print_program_with_dist_attr, is_gradient_clip_op
from .utils import is_gradient_clip_op, __not_shape_var_type__
from .operators import find_compatible_distributed_operator_impls
from .dist_context import get_default_distributed_context, _node_id
from .dist_tensor import DistributedTensor
......@@ -142,6 +142,7 @@ class Completer:
def __init__(self, dist_context):
assert dist_context is not None
self._dist_context = dist_context
self._has_prepared = False
def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True):
changed = False
......@@ -366,7 +367,14 @@ class Completer:
def _update_dims_mapping_for_special(self):
# Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it
op_nodes = self._dist_context._serial_ordered_op_nodes
# NOTE: this list may be changed if Paddle changes the existing rules.
related_reader_ops = [
"create_py_reader", "create_double_buffer_reader", "read"
]
for op_node in op_nodes:
if op_node.op() is not None \
and op_node.op().type() in related_reader_ops:
continue
op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
for tensor_node in op_node.outputs:
if tensor_node.is_var() and tensor_node.var() is not None:
......@@ -406,6 +414,7 @@ class Completer:
reach_fix_point = False
else:
reach_fix_point = True
# NOTE: this will be removed after changing the reshard rule
self._update_dims_mapping_for_special()
def _update_process_mesh_by_nearest(self, op_node, nearest_op_node):
......@@ -494,14 +503,14 @@ class Completer:
for tensor_node in node.inputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
if tensor_node.var().type() in __not_shape_var_type__ \
or len(tensor_node.var().shape()) != 1:
flag = False
break
for tensor_node in node.outputs:
if tensor_node.is_var() and tensor_node.var(
) is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER \
if tensor_node.var().type() in __not_shape_var_type__ \
or len(tensor_node.var().shape()) != 1:
flag = False
break
......@@ -719,6 +728,8 @@ class Completer:
self._update_process_mesh_between_graphs()
def _prepare(self):
if self._has_prepared:
return
self._while_op_nodes = {}
self._array_nodes = {}
self._node_pairs_between_graphs = []
......@@ -732,6 +743,8 @@ class Completer:
if self._array_nodes.get(array_var_name, None) is None:
self._array_nodes[array_var_name] = []
self._array_nodes[array_var_name].append(node)
# Add the array input node
self._array_nodes[array_var_name].append(node.inputs[0])
if node.op().type() == "write_to_array":
array_var_name = node.op().output("Out")[0]
if self._array_nodes.get(array_var_name, None) is None:
......@@ -752,6 +765,7 @@ class Completer:
and after_node.var().name() == node.var().name():
self._node_pairs_between_graphs.append(
(after_node, node))
self._has_prepared = True
def complete_forward_annotation(self, serial_main_program=None):
""" Complete annotation for the partial annotated serial_main_program.
......@@ -899,6 +913,72 @@ class Completer:
else:
dist_op.dist_attr = original_op_dist_attr
def _complete_tensor_dist_attr_by_op(self, serial_main_program=None):
if serial_main_program is None:
serial_main_program = self._dist_context.serial_main_program
else:
self._dist_context._serial_main_program = serial_main_program
self._dist_context.initialize()
self._prepare()
has_set_dist_attr = set()
all_nodes = self._dist_context.serial_ordered_nodes
for node in all_nodes:
if node.is_op():
if node.op().type() in ["while"]:
continue
dist_op = self._dist_context.get_dist_op_for_graph(node)
op_dist_attr = dist_op.dist_attr
for tensor_node in node.inputs:
if tensor_node.is_var() and tensor_node.var() is not None:
# Skip the non-leaf var node
if len(tensor_node.inputs) != 0:
continue
tensor_desc = tensor_node.var()
tensor_name = tensor_desc.name()
tensor = dist_op.get_serial_input(tensor_name)
# Use the first op to set the tensor dist attr
if tensor_name in has_set_dist_attr:
continue
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
tensor_dist_attr.process_mesh = op_dist_attr.process_mesh
tensor_dist_attr.dims_mapping = op_dist_attr.get_input_dims_mapping(
tensor_name) if tensor.is_parameter else [
-1 for i in tensor_desc.shape()
]
has_set_dist_attr.add(tensor_name)
for tensor_node in node.outputs:
if tensor_node.is_var() and tensor_node.var() is not None:
tensor_name = tensor_node.var().name()
if tensor_name in has_set_dist_attr:
continue
tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
tensor_node)
tensor_dist_attr.process_mesh = op_dist_attr.process_mesh
tensor_dist_attr.dims_mapping = op_dist_attr.get_output_dims_mapping(
tensor_name)
has_set_dist_attr.add(tensor_name)
self._update_process_mesh_for_specials()
self._update_process_mesh_between_graphs()
self._update_dims_mapping_for_special()
self._update_dims_mapping_between_graphs()
# Copy the corresponding distributed attribute from graph to serial_main_program
self._dist_context.copy_dist_attr_from_graph_to_program()
# Do the validation check and amend some completion
self._dist_context.amend_dist_attr_for_program()
self._dist_context.validate_dist_attr_for_program()
def _complete_high_order_grad_annotation(self, serial_main_program=None):
"""
NOTE:
......
......@@ -116,3 +116,10 @@ set_field_default_config(TUNING, "profile_start_step", 1)
set_field_default_config(TUNING, "profile_end_step", 1)
set_field_default_config(TUNING, "run_after_tuning", True)
set_field_default_config(TUNING, "verbose", True)
#########################################
# dataset configuration
#########################################
DATASET = "dataset"
set_field_default_config(DATASET, "enable", False)
set_field_default_config(DATASET, "num_shards", 1)
......@@ -167,6 +167,25 @@ class DropoutOpCost(CompOpCost):
return 0
@register_op_cost
class DropoutGradOpCost(CompOpCost):
OP_TYPE = "dropout_grad"
def __init__(self, op=None, op_desc=None, cluster=None):
super(DropoutGradOpCost, self).__init__(op=op,
op_desc=op_desc,
cluster=cluster)
    # For a concrete COMP OP, the calc_time and calc_flops functions need to be overridden
def calc_flops(self):
# NOTE: The actual formula will be filled in the future
return 0
def calc_time(self):
# NOTE: The actual formula will be filled in the future
return 0
@register_op_cost
class ElementwiseAddOpCost(CompOpCost):
OP_TYPE = "elementwise_add"
......@@ -395,6 +414,42 @@ class FillConstantBatchSizeLikeOpCost(CompOpCost):
return 0
@register_op_cost
class FusedSoftmaxMaskUpperTriangleOpCost(CompOpCost):
OP_TYPE = "fused_softmax_mask_upper_triangle"
def __init__(self, op=None, op_desc=None, cluster=None):
super(FusedSoftmaxMaskUpperTriangleOpCost,
self).__init__(op=op, op_desc=op_desc, cluster=cluster)
    # For a concrete COMP OP, the calc_time and calc_flops functions need to be overridden
def calc_flops(self):
# NOTE: The actual formula will be filled in the future
return 0
def calc_time(self):
# NOTE: The actual formula will be filled in the future
return 0
@register_op_cost
class FusedSoftmaxMaskUpperTriangleGradOpCost(CompOpCost):
OP_TYPE = "fused_softmax_mask_upper_triangle_grad"
def __init__(self, op=None, op_desc=None, cluster=None):
super(FusedSoftmaxMaskUpperTriangleGradOpCost,
self).__init__(op=op, op_desc=op_desc, cluster=cluster)
    # For a concrete COMP OP, the calc_time and calc_flops functions need to be overridden
def calc_flops(self):
# NOTE: The actual formula will be filled in the future
return 0
def calc_time(self):
# NOTE: The actual formula will be filled in the future
return 0
@register_op_cost
class GatherOpCost(CompOpCost):
OP_TYPE = "gather"
......
......@@ -45,6 +45,8 @@ class CostEstimator:
) # {`op_id`: {"reshard": [], "dist_op": [], "local_cost": local_cost}}}
self._bubble_time_mapping = {}
self._ordered_ops = []
self.max_memories = {}
self.max_memory = None
@property
def loop_count(self):
......@@ -123,7 +125,7 @@ class CostEstimator:
for i in range(loop_count):
for op in ops:
self._detailed_cost[op.desc.id()] = OrderedDict()
# if in the while sub block, the detail of cost is the last cost
# If in the while sub block, the detail of cost is the last cost
detail = self._detailed_cost[op.desc.id()]
detail["reshard_cost"] = OrderedDict() #
detail["dist_op_cost"] = []
......@@ -147,15 +149,15 @@ class CostEstimator:
var = get_var_with_recursion(var_name, block, self.program)
reshard_cost = resharder.get_cost(op, var, self.cluster)
# calc reshard cost
# Calc reshard cost
if reshard_cost is not None:
detail["reshard_cost"][var_name] = reshard_cost
comm_costs = reshard_cost[0]
local_comp_cost = reshard_cost[1]
for comm_cost in comm_costs:
# time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost.
# comm sync
# Time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost.
# Comm sync
for item in comm_cost:
group_ranks, cost = item
max_time = None
......@@ -183,7 +185,7 @@ class CostEstimator:
for comp_cost in local_comp_cost[rank]:
self.local_cost(rank).time += comp_cost.time
# calc dist op cost
# Calc dist op cost
dist_op = dist_context.get_dist_op_for_program(op)
op_dist_attr = dist_op.dist_attr
processes = op_dist_attr.process_mesh.processes
......@@ -201,7 +203,7 @@ class CostEstimator:
continue
for item in dist_op_cost:
if isinstance(item, list):
# comm sync
# Comm sync
for comm_op_cost in item:
max_time = None
cost_time = {}
......@@ -222,9 +224,9 @@ class CostEstimator:
self._bubble_time_mapping[rank] += (
max_time - cost_time[rank])
elif isinstance(item, dict):
# op just one
# Op just one
for rank in processes:
# dp+pp+mp
# DP+PP+MP
if rank not in item:
continue
self.local_cost(rank).time += item[rank].time
......@@ -267,7 +269,7 @@ class CostEstimator:
return result
memories = {}
max_memories = {}
self.max_memories = {}
var_info = {
} # var_name: [[process_mesh, dims_mapping], [id]], [[process_mesh, dims_mapping], [id]]}
......@@ -277,6 +279,10 @@ class CostEstimator:
self._ordered_ops.sort(key=lambda x: x[0])
for op_id, op in self._ordered_ops:
if op.type in [
"create_py_reader", "create_double_buffer_reader", "read"
]:
continue
dist_op = dist_context.get_dist_op_for_program(op)
process_mesh = dist_op.dist_attr.process_mesh
for var_name in op.input_arg_names:
......@@ -288,7 +294,7 @@ class CostEstimator:
input_dims_mapping)
if key not in var_info[var_name]:
var_info[var_name][key] = {}
# it is even partition now
# It is even partition now
if "memory" not in var_info[var_name][key]:
var = dist_op.get_serial_input(var_name)
global_sizes = var.shape
......@@ -326,6 +332,10 @@ class CostEstimator:
has_used_vars = set()
for op_id, op in self._ordered_ops:
if op.type in [
"create_py_reader", "create_double_buffer_reader", "read"
]:
continue
can_free_memories = {}
can_free_vars = set()
dist_op = dist_context.get_dist_op_for_program(op)
......@@ -337,14 +347,14 @@ class CostEstimator:
input_dims_mapping)
has_used_var = var_name + key
var = dist_op.get_serial_input(var_name)
# not used
# Not used
if var_name + key not in has_used_vars:
has_used_vars.add(has_used_var)
for process in process_mesh.processes:
if process not in memories:
memories[process] = 0
memories[process] += var_info[var_name][key]["memory"]
# used
# Used
else:
if op_id == var_info[var_name][key]["position"][-1]:
if has_used_var not in can_free_vars:
......@@ -363,14 +373,14 @@ class CostEstimator:
output_dims_mapping)
has_used_var = var_name + key
var = dist_op.get_serial_output(var_name)
# not used
# Not used
if var_name + key not in has_used_vars:
has_used_vars.add(has_used_var)
for process in process_mesh.processes:
if process not in memories:
memories[process] = 0
memories[process] += var_info[var_name][key]["memory"]
# used
# Used
else:
if op_id == var_info[var_name][key]["position"][-1]:
if has_used_var not in can_free_vars:
......@@ -382,21 +392,22 @@ class CostEstimator:
can_free_memories[process] += var_info[
var_name][key]["memory"]
# calc peak memory
# Calc peak memory
for process in memories:
if process not in max_memories:
max_memories[process] = memories[process]
if process not in self.max_memories:
self.max_memories[process] = memories[process]
else:
if memories[process] > max_memories[process]:
max_memories[process] = memories[process]
if memories[process] > self.max_memories[process]:
self.max_memories[process] = memories[process]
# free memory
# Free memory
for process in can_free_memories:
if process in memories:
memories[process] -= can_free_memories[process]
# Calculate the max memory in all ranks
max_memory = max(max_memories.values())
max_memory = max(self.max_memories.values())
self.max_memory = max_memory
return max_memory
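    # NOTE: in the loop above, memory is charged to each rank the first time a
    # (var_name, process_mesh, dims_mapping) combination is used, released
    # after the op recorded as that combination's last use, and the running
    # per-rank peak is kept in self.max_memories; the return value is the
    # peak over all ranks.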
......@@ -410,3 +421,143 @@ class CostEstimator:
self._estimate_core(dist_context, resharder, block)
return self.global_cost
def _print_tag(self, max_len, length):
tag = "+" + "-" * max_len
for i in range(length):
print(tag, end="")
if i == length - 1:
print("+")
def _print_vals(self, vals, max_len):
for idx, val in enumerate(vals):
s = "|" + str(val).center(max_len)
print(s, end="")
if idx == len(vals) - 1:
print("|")
def _pretty_print_memory_cost(self):
"""Print memory of every rank prettily."""
if not self.max_memories or not self.max_memory:
raise ValueError("Please calculate memory cost before print.")
# Padding automatically
max_len = 0
header = ["Rank", "Memory(MiB)"]
memories = [
int(item // 1e6) for item in list(self.max_memories.values())
]
for memory in (memories + header):
if len(str(memory)) > max_len:
max_len = len(str(memory))
max_len += 4 # for pretty print of center
# Print tag
self._print_tag(max_len, len(header))
# Print header
self._print_vals(header, max_len)
# Print tag
self._print_tag(max_len, len(header))
# Print rank and its memory
for i in range(len(self.max_memories)):
memory = memories[i]
vals = [i, memory]
self._print_vals(vals, max_len)
self._print_tag(max_len, len(header))
def _pretty_print_global(self):
"""Print global execution time and max memory prettily."""
if not self.max_memories or not self.max_memory:
raise ValueError("Please calculate cost before print.")
# Padding automatically
max_len = 0
header = ["Execution Time(ms)", "Max Memory(MiB)"]
vals = [round(self.global_cost.time, 3), int(self.max_memory // 1e6)]
for memory in (vals + header):
if len(str(memory)) > max_len:
max_len = len(str(memory))
max_len += 4 # for pretty print of center
# Print tag
self._print_tag(max_len, len(header))
# Print header
self._print_vals(header, max_len)
# Print tag
self._print_tag(max_len, len(header))
# Print exec time and max memory
self._print_vals(vals, max_len)
# Print tag
self._print_tag(max_len, len(header))
def pretty_print_cost(self):
"""Print cost prettily."""
print("The global execution time and max memory are as follows:")
self._pretty_print_global()
print("The memory of every rank is as follows:")
self._pretty_print_memory_cost()
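    # NOTE (illustrative): pretty_print_cost() renders tables like
    # +--------+---------------+
    # |  Rank  |  Memory(MiB)  |
    # +--------+---------------+
    # |   0    |     1024      |
    # +--------+---------------+
    # (the numbers here are made up).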
def get_cost_from_engine(engine, mode):
from ..utils import to_list
# Construct cost estimator by original main program
serial_main_prog = engine._serial_main_progs[mode].clone(
) if mode in engine._serial_main_progs else engine._orig_main_prog.clone()
serial_startup_prog = engine._serial_startup_progs[mode].clone(
) if mode in engine._serial_startup_progs else engine._orig_startup_prog.clone(
)
losses = to_list(
engine._loss) if (not isinstance(engine._loss, paddle.nn.Layer)
and not callable(engine._loss)) else engine._losses
if mode in engine._dist_contexts:
dist_context = engine._dist_contexts[mode]
completer = engine._planners[mode].completer
else:
from ..completion import Completer
from ..dist_context import DistributedContext
dist_context = DistributedContext(serial_main_prog, serial_startup_prog,
engine._optimizer, losses, {},
{"loss": losses}, engine._cluster,
engine._strategy)
completer = Completer(dist_context)
completer.complete_forward_annotation()
dist_context.block_state.parse_forward_blocks(
dist_context.serial_main_program)
if mode == "eval" or mode == "predict":
cost_estimator = CostEstimator(serial_main_prog, engine._cluster)
elif mode == "train":
from ..parallelizer_v2 import Parallelizer
# Get serial main program with backward
serial_optimizer = engine._optimizer
parallelizer = Parallelizer(mode, completer, dist_context)
# Generate backward
loss_name = dist_context.serial_loss.name
serial_loss = serial_main_prog.global_block()._var_recursive(loss_name)
params_grads = parallelizer._generate_backward(serial_main_prog,
serial_startup_prog,
serial_loss)
# Generate optimizer
optimizer_ops = parallelizer._generate_optimizer(
serial_main_prog, serial_startup_prog, serial_optimizer,
params_grads)
cost_estimator = CostEstimator(serial_main_prog, engine._cluster)
# Estimate global_cost and max memory
global_cost = cost_estimator.estimate(dist_context)
max_memory = cost_estimator._estimate_max_memory_by_dist_op(dist_context)
# Print the cost
cost_estimator.pretty_print_cost()
return global_cost, max_memory
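# NOTE (hedged sketch): this helper backs the Engine cost interface from the
# 'Add cost interface' change above; unprepared modes fall back to the
# original serial programs, e.g.:
#   global_cost, max_memory = get_cost_from_engine(engine, mode="train")
#   print(global_cost.time, max_memory)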
......@@ -77,7 +77,6 @@ class DistributedContext:
self._serial_optimizer = None
self._serial_feed_vars = {}
self._serial_fetch_vars = {}
self._lr_optimizer = None # record the optimzier holding lr_scheduler
# Data members related to the program
self._dist_tensors_for_program = {}
......@@ -268,12 +267,24 @@ class DistributedContext:
def _restore_serial_fetch_vars(self):
for key, var_list in self._original_serial_fetch_vars.items():
new_var_list = []
for var in var_list:
block_idx = var.block.idx
var_name = var.name
var = self._serial_main_program.blocks[
block_idx]._var_recursive(var_name)
new_var_list.append(var)
            # metrics is a list of lists
if key == "metrics":
for inner_var_list in var_list:
new_inner_var_list = []
for var in inner_var_list:
block_idx = var.block.idx
var_name = var.name
var = self._serial_main_program.blocks[
block_idx]._var_recursive(var_name)
new_inner_var_list.append(var)
new_var_list.append(new_inner_var_list)
else:
for var in var_list:
block_idx = var.block.idx
var_name = var.name
var = self._serial_main_program.blocks[
block_idx]._var_recursive(var_name)
new_var_list.append(var)
self._serial_fetch_vars[key] = new_var_list
def _restore_serial_info(self, mode="to_backup"):
......@@ -861,7 +872,7 @@ class DistributedContext:
"_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \
"_serial_ordered_op_nodes", "_original_serial_loss", \
"_original_serial_feed_vars", "_original_serial_fetch_vars", \
"_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_lr_optimizer", \
"_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_serial_optimizer", \
"_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \
"_pass_context"]:
setattr(result, k, v)
......
......@@ -14,44 +14,14 @@
import abc
import numpy as np
from functools import wraps
import paddle
from .utils import to_list
from paddle.fluid.layers.utils import flatten
from paddle.io import DataLoader, BatchSampler, IterableDataset
from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler
from paddle.io import BatchSampler, IterableDataset
from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler, DistributedBatchSampler
from paddle.fluid.dataloader.dataloader_iter import _DatasetKind, default_collate_fn, default_convert_fn
class DistributedDataLoader(metaclass=abc.ABCMeta):
def __init__(self, dataset, batch_size=1, epochs=1, drop_last=False):
if isinstance(dataset, IterableDataset):
self.dataset_kind = _DatasetKind.ITER
else:
self.dataset_kind = _DatasetKind.MAP
self.dataset = dataset
self.epochs = epochs
self.drop_lost = drop_last
if batch_size is None:
self.batch_size = None
self.batch_sampler = None
else:
self.batch_size = batch_size
if isinstance(dataset, IterableDataset):
self.batch_sampler = _InfiniteIterableSampler(
dataset, batch_size)
else:
self.batch_sampler = BatchSampler(dataset,
batch_size=batch_size,
shuffle=False,
drop_last=drop_last)
self.auto_collate_batch = self.batch_sampler is not None
self.sampler_iter = iter(self.index_sampler)
class DistributedDataLoaderBase(metaclass=abc.ABCMeta):
@abc.abstractmethod
def __iter__(self):
......@@ -72,40 +42,72 @@ class DistributedDataLoader(metaclass=abc.ABCMeta):
return _InfiniteIterableSampler(self.dataset, 1)
class NonIterableGeneratorLoader(DistributedDataLoader):
class DistributedDataLoaderFromGenerator(DistributedDataLoaderBase):
def __init__(self,
dataset,
feed_list,
places,
feed_list=None,
capacity=None,
use_double_buffer=True,
iterable=True,
return_list=False,
use_multiprocess=False,
drop_last=True,
places=None,
batch_size=1,
epochs=1,
steps_per_epoch=None,
collate_fn=None,
split_data=True,
data_parallel_world_size=[],
data_parallel_rank=[],
drop_last=False,
split_data=True):
data_parallel_rank=[]):
self.dataset = dataset
self.feed_list = feed_list
self.capacity = capacity
self.use_double_buffer = use_double_buffer
self.iterable = iterable
self.return_list = return_list
self.use_multiprocess = use_multiprocess
self.drop_last = drop_last
self.places = places
self.batch_size = batch_size
self.epochs = epochs
self.steps_per_epoch = steps_per_epoch
self.collate_fn = collate_fn
self.split_data = split_data
assert len(data_parallel_world_size) == len(feed_list)
assert len(data_parallel_rank) == len(feed_list)
self.dp_world_sizes = data_parallel_world_size
self.dp_ranks = data_parallel_rank
self.split_data = split_data
super(NonIterableGeneratorLoader,
self).__init__(dataset, batch_size, epochs, drop_last)
if isinstance(dataset, IterableDataset):
self.dataset_kind = _DatasetKind.ITER
else:
self.dataset_kind = _DatasetKind.MAP
if self.batch_size is None:
self.batch_sampler = None
else:
if isinstance(dataset, IterableDataset):
self.batch_sampler = _InfiniteIterableSampler(
dataset, batch_size)
else:
self.batch_sampler = BatchSampler(dataset,
batch_size=batch_size,
shuffle=False,
drop_last=drop_last)
self.auto_collate_batch = self.batch_sampler is not None
self.sampler_iter = iter(self.index_sampler)
if self.auto_collate_batch:
self.collate_fn = collate_fn or default_collate_fn
else:
self.collate_fn = collate_fn or default_convert_fn
self.dataset_fetcher = _DatasetKind.create_fetcher(
self.dataset_kind, self.dataset, self.auto_collate_batch,
self.collate_fn, self.drop_lost)
self.collate_fn, self.drop_last)
self._steps = self._infer_steps()
self._inner_dataloader = self._create_inner_dataloader()
......@@ -118,8 +120,10 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
def __next__(self):
if not self._steps:
self._cur_step += 1
return None
elif self._cur_step < self._steps:
self._cur_step += 1
return None
else:
self._inner_dataloader.reset()
self.sampler_iter = iter(self.index_sampler)
......@@ -141,6 +145,16 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
)
return steps_per_epoch
@property
def index_sampler(self):
if self.auto_collate_batch:
return self.batch_sampler
else:
if self.dataset_kind == _DatasetKind.MAP:
return list(range(len(self.dataset)))
else:
return _InfiniteIterableSampler(self.dataset, 1)
def _create_inner_dataloader(self):
def data_generator():
......@@ -153,7 +167,7 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
self.dataset_fetcher = _DatasetKind.create_fetcher(
self.dataset_kind, self.dataset,
self.auto_collate_batch, self.collate_fn,
self.drop_lost)
self.drop_last)
break
partial_data = []
......@@ -173,7 +187,83 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
yield partial_data
dataloader = paddle.fluid.io.DataLoader.from_generator(
feed_list=self.feed_list, capacity=70, iterable=False)
feed_list=self.feed_list,
capacity=self.capacity,
use_double_buffer=self.use_double_buffer,
# iterable=self.iterable,
iterable=False,
return_list=self.return_list,
use_multiprocess=self.use_multiprocess,
drop_last=self.drop_last)
dataloader.set_batch_generator(data_generator, self.places)
return dataloader
class DistributedDataLoader(DistributedDataLoaderBase):
def __init__(self,
dataset,
feed_list=None,
places=None,
return_list=True,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=None,
num_workers=0,
use_buffer_reader=True,
use_shared_memory=True,
timeout=0,
worker_init_fn=None,
epochs=1,
steps_per_epoch=None,
split_data=True,
data_parallel_world_size=[],
data_parallel_rank=[]):
self.dataset = dataset
self.feed_list = feed_list
self.return_list = return_list
self.places = places
self.batch_size = batch_size
self.shuffle = shuffle
self.drop_last = drop_last
self.collate_fn = collate_fn
self.num_workers = num_workers
self.use_buffer_reader = use_buffer_reader
self.use_shared_memory = use_shared_memory
self.timeout = timeout
self.worker_init_fn = worker_init_fn
self.epochs = epochs
self.steps_per_epoch = steps_per_epoch
self.dp_world_sizes = data_parallel_world_size
self.dp_ranks = data_parallel_rank
self.split_data = split_data
# TODO: rank info
self.batch_sampler = DistributedBatchSampler(
self.dataset, self.batch_size, self.dp_world_sizes[0],
self.dp_ranks[0], self.shuffle, self.drop_last)
self._inner_dataloader = self._create_inner_dataloader()
def __iter__(self):
return self
def __next__(self):
return next(self.data)
def _create_inner_dataloader(self):
dataloader = paddle.fluid.io.DataLoader(
self.dataset,
feed_list=self.feed_list,
places=self.places,
return_list=self.return_list,
batch_sampler=self.batch_sampler,
collate_fn=self.collate_fn,
num_workers=self.num_workers,
use_buffer_reader=self.use_buffer_reader,
use_shared_memory=self.use_shared_memory,
timeout=self.timeout,
worker_init_fn=self.worker_init_fn)
self.data = (x for x in dataloader)
return dataloader
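# NOTE (hedged sketch): the map-style wrapper above can be driven like the
# snippet below; the argument values are illustrative, and
# data_parallel_world_size/data_parallel_rank must be non-empty because
# DistributedBatchSampler uses their first entries:
#   loader = DistributedDataLoader(dataset,
#                                  places=paddle.static.cuda_places(),
#                                  batch_size=8,
#                                  data_parallel_world_size=[2],
#                                  data_parallel_rank=[0])
#   for batch in loader:
#       ...  # __next__ pulls from the inner paddle DataLoader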
......@@ -13,8 +13,6 @@
# limitations under the License.
import os
import time
import copy
import logging
import random
import numpy as np
......@@ -24,18 +22,18 @@ import paddle
import paddle.utils as utils
from paddle import fluid, static
from paddle.jit import to_static
from paddle.metric import Metric
from paddle.static import InputSpec
from paddle.fluid import core
from paddle.fluid import Variable
from paddle.fluid.layers.utils import flatten
from paddle.fluid.executor import global_scope, _to_name_str
from paddle.fluid.framework import Operator, Parameter, _non_static_mode
from paddle.fluid.framework import Operator, _non_static_mode
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed import fleet
from .callbacks import config_callbacks
from .converter import Converter
from .helper import ProgramHelper
from .cluster import Cluster, get_default_cluster
......@@ -43,13 +41,15 @@ from .planner_v2 import Planner
from .parallelizer_v2 import Parallelizer
from .dist_op import DistributedOperator
from .dist_saver import DistributedSaver
from .dist_loader import NonIterableGeneratorLoader
from .utils import print_program_with_dist_attr, to_list
from .utils import get_logger, get_dist_attr
from .dist_loader import DistributedDataLoaderFromGenerator, DistributedDataLoader
from .utils import to_list, get_dist_attr, get_lr
from .process_group import new_process_group, get_all_process_groups
from .dist_context import DistributedContext, get_default_distributed_context
from .strategy import Strategy
from .interface import _get_fetches
from .interface import CollectionNames, get_collection
from ..utils.log_utils import get_logger
from .utils import initialize_pg_in_full_mode
from .cost.estimate_cost import get_cost_from_engine
class Engine:
......@@ -129,12 +129,6 @@ class Engine:
"'model must be sub classes of `paddle.nn.Layer` or any callable function."
)
self._model = model
if loss and not isinstance(loss,
paddle.nn.Layer) and not callable(loss):
raise TypeError(
"'loss' must be sub classes of `paddle.nn.Layer` or any callable function."
)
self._loss = loss
if optimizer and not isinstance(
......@@ -187,17 +181,277 @@ class Engine:
self._feed_vars = {}
self._fetch_vars = {}
self._planners = {}
self._mode_init_states = {
self._has_prepared = {"train": False, "eval": False, "predict": False}
self._has_prepared_reader = {
"train": False,
"eval": False,
"predict": False
}
self._inputs_spec = []
self._labels_spec = []
self._inputs = []
self._labels = []
self._skip_build = False
self._outside_dataloader = False
self._planned_mode = None
self._dygraph_mode = False
self._tuning = self._strategy.tuning
self._losses = None
self.history = None
def _prepare_data_spec(self, data, split, batch_size):
inputs_spec = []
labels_spec = []
if isinstance(data, paddle.io.IterableDataset):
if split is None:
inputs, labels = next(iter(data))
else:
sample = next(iter(data))
inputs = sample[:split]
labels = sample[split:]
elif isinstance(data, paddle.io.Dataset):
if split is None:
inputs, labels = data[0]
else:
sample = data[0]
inputs = sample[:split]
labels = sample[split:]
else:
raise ValueError(
"Data should be a Dataset or IterableDatset, but received {}.".
format(type(data).__name__))
inputs = to_list(inputs)
labels = to_list(labels)
num_shards = self._strategy.dataset.num_shards
def _adjust_item_spec(num_shards, spec):
if num_shards > 1 and len(spec.shape) > 1:
spec.shape[0] = spec.shape[0] * num_shards
def _infer_item_spec(item, name, batch_size, specs):
if isinstance(item, np.ndarray):
spec = InputSpec.from_numpy(item, name)
if batch_size is None:
_adjust_item_spec(num_shards, spec)
specs.append(spec)
else:
specs.append(spec.batch(batch_size))
            elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)):
                spec = InputSpec.from_tensor(item, name)
                _adjust_item_spec(num_shards, spec)
if batch_size is None:
specs.append(spec)
else:
specs.append(spec.batch(batch_size))
else:
specs.append(InputSpec([batch_size], type(item), name))
if inputs is not None:
for i, item in enumerate(inputs):
                assert item is not None, "Received None input."
name = "input" + str(i)
_infer_item_spec(item, name, batch_size, inputs_spec)
if labels is not None:
for i, item in enumerate(labels):
                assert item is not None, "Received None input."
name = "label" + str(i)
_infer_item_spec(item, name, batch_size, labels_spec)
inputs_spec = self._validate_spec(inputs_spec)
labels_spec = self._validate_spec(labels_spec)
return inputs_spec, labels_spec
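    # NOTE (illustrative): for a map-style dataset whose samples are
    # (np.float32[784], np.int64[1]) pairs, split=None and batch_size=64 give
    #   inputs_spec = [InputSpec([64, 784], 'float32', 'input0')]
    #   labels_spec = [InputSpec([64, 1], 'int64', 'label0')]
    # while num_shards > 1 with batch_size=None scales the leading dim instead.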
def _prepare_data_tensor(self,
inputs_spec,
labels_spec,
inputs=None,
labels=None):
if _non_static_mode() or self._dygraph_mode:
return None, None
inputs_spec = inputs_spec if inputs_spec else []
labels_spec = labels_spec if labels_spec else []
if inputs_spec:
assert isinstance(inputs_spec, list), \
"inputs should be list, but received {}".format(type(inputs_spec))
if inputs is None:
inputs = [s._create_feed_layer() for s in inputs_spec]
else:
assert isinstance(inputs, list), \
"inputs should be list, but received {}".format(type(inputs))
for input_spec, input in zip(inputs_spec, inputs):
if input_spec.shape != input.shape:
input.desc.set_shape(input_spec.shape)
if labels_spec:
assert isinstance(labels_spec, list), \
"labels should be list, but received {}".format(type(labels_spec))
if labels is None:
labels = [s._create_feed_layer() for s in labels_spec]
else:
assert isinstance(labels, list), \
"labels should be list, but received {}".format(type(labels))
for label_spec, label in zip(labels_spec, labels):
if label_spec.shape != label.shape:
label.desc.set_shape(label_spec.shape)
return inputs, labels
def _prepare_reader(self):
dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank]
dist_context = self._dist_contexts[self._mode]
dist_main_block = dist_main_prog.global_block()
def _prepare_single_mode(self, mode):
# NOTE: this list may be changed if Paddle changes the existing rules.
related_reader_ops = [
"create_py_reader", "create_double_buffer_reader", "read"
]
# remove the first three ops if multiple run fit/evaluate/predict
if dist_main_block.ops[0].type == 'create_py_reader':
for i in range(len(related_reader_ops)):
if dist_main_block.ops[0].type in related_reader_ops:
dist_main_block._remove_op(0, sync=False)
dist_main_block._sync_with_cpp()
# Step 1: find the reader ops
reader_op_indices = []
for idx, op in enumerate(dist_main_block.ops):
if op.type in related_reader_ops:
reader_op_indices.append(idx)
# Step 2: insert the new reader ops to cpp
new_reader_ops = []
for idx in reversed(reader_op_indices):
new_op_desc = dist_main_block.desc._prepend_op()
new_op_desc.copy_from(dist_main_block.ops[idx].desc)
new_op = Operator(dist_main_block,
new_op_desc,
type=new_op_desc.type())
new_reader_ops.append(new_op)
dist_op = DistributedOperator(new_op)
dist_context.add_dist_op_for_program(dist_op)
# Step 3: insert the new reader ops to python
for new_op in new_reader_ops:
dist_main_block.ops.insert(0, new_op)
for i in range(len(reader_op_indices)):
reader_op_indices[i] += len(reader_op_indices)
# Step 4: remove the old reader ops from python and cpp
for idx in reversed(reader_op_indices):
op = dist_main_block.ops.pop(idx)
dist_main_block.desc._remove_op(idx, idx + 1)
dist_main_block._sync_with_cpp()
self._has_prepared_reader[self._mode] = True
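    # NOTE: the four steps above move the reader ops back to the front of the
    # block (passes can displace them), register each as a DistributedOperator,
    # and keep the Python op list and the C++ desc in sync so that a repeated
    # fit/evaluate/predict can strip and rebuild them safely.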
def _prepare_feed(self, data, user_feeds, mode):
feeds = {}
if data is not None:
if isinstance(data, (list, tuple)):
if len(data) == 1 and isinstance(data[0], dict):
for name, data in data[0].items():
feeds[name] = data
else:
raise ValueError("Unsupported data {}".format(data))
elif isinstance(data, dict):
for name, data in data.items():
feeds[name] = data
else:
raise ValueError("Unsupported data {}".format(data))
if user_feeds is not None:
assert isinstance(user_feeds, dict), \
"user_feeds must be a dict, but receive {}".format(type(user_feeds).__name__)
for name, data in user_feeds.items():
feeds[name] = data
return feeds
def _prepare_fetch(self, user_fetches, mode):
if user_fetches is not None:
assert isinstance(user_fetches, list), \
"user_fetches must be a list, but receive {}".format(type(user_fetches).__name__)
fetch_names = []
fetch_indices = []
def _process_fetch_group(group_name, var_list):
group_indices = []
for var in var_list:
# Remove duplicate var_names
if self._is_local_var(var):
var_name = _to_name_str(var)
if var_name not in fetch_names:
fetch_names.append(var_name)
group_indices.append(fetch_names.index(var_name))
if not group_indices:
fetch_names.append([])
fetch_indices.append(group_indices)
if mode != "predict":
_process_fetch_group("loss", self._fetch_vars[mode]["loss"])
if mode != "predict":
metrics = self._fetch_vars[mode]["metrics"]
for i, var_list in enumerate(metrics):
_process_fetch_group("metrics_" + str(i), var_list)
if mode == "predict":
_process_fetch_group("outputs", self._fetch_vars[mode]["outputs"])
user_fetches_collection = [
item[1] for item in get_collection(CollectionNames.FETCHES)
]
var_list = (user_fetches_collection or []) + (user_fetches or [])
_process_fetch_group("fetches", var_list)
return fetch_names, fetch_indices
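    # NOTE (illustrative): in 'train' mode with one loss, one metric group
    # and no user fetches, the result looks like
    #   fetch_names   == [loss_var_name, metric_var_name]
    #   fetch_indices == [[0], [1], []]  # loss, metrics_0, fetches groups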
def _prepare_logger(self,
outs,
epoch=None,
step=None,
lr=None,
fetch_names=None,
fetch_indices=None,
mode=None):
logs = {}
if epoch is not None:
logs["epoch"] = epoch
if step is not None:
logs["step"] = step + 1
if lr is not None:
logs["lr"] = lr
group_idx = 0
if mode != "predict":
# logging loss
loss_indices = fetch_indices[group_idx]
assert len(loss_indices) <= 1
for idx in loss_indices:
logs["loss"] = outs[idx][0]
group_idx += 1
# logging metrics
metric_vars = self._fetch_vars[mode]["metrics"]
if metric_vars:
for metric in self._metrics:
metrics_indices = fetch_indices[group_idx]
metric_out = []
for idx in metrics_indices:
metric_out.append(outs[idx])
if metric_out:
metric.update(*metric_out)
results = metric.accumulate()
for i, res in enumerate(to_list(results)):
logs[metric.name()[i]] = res
group_idx += 1
# logging outputs
elif mode == "predict":
outputs_indices = fetch_indices[group_idx]
logs_out = {}
for idx in outputs_indices:
logs_out["out%d" % (idx)] = outs[idx]
logs["outputs"] = logs_out
group_idx += 1
# logging user fetches
collect_fetches = get_collection(CollectionNames.FETCHES)
logs_fetch = {}
for name, var in collect_fetches:
if var.name in fetch_names:
idx = fetch_names.index(var.name)
logs_fetch[name or var.name] = outs[idx]
logs["fetches"] = logs_fetch
return logs
def _prepare_program(self, mode):
# Do the build process
self._build(mode)
# Do the planning process
......@@ -206,7 +460,7 @@ class Engine:
self._parallel(mode)
# Init comm and startup program
self._initialize(mode)
self._mode_init_states[mode] = True
self._has_prepared[mode] = True
def _build(self, mode):
if _non_static_mode() or self._dygraph_mode:
......@@ -214,8 +468,8 @@ class Engine:
self._dygraph_mode = True
self._logger.info("Building model with 'to_static' method.")
inputs_spec = self.inputs_spec
labels_spec = self.labels_spec if self.labels_spec else []
inputs_spec = self._inputs_spec
labels_spec = self._labels_spec if self._labels_spec else []
self.program_helper = ProgramHelper(self._model, self._loss,
self._metrics, inputs_spec,
labels_spec)
......@@ -230,8 +484,12 @@ class Engine:
outputs = self.program_helper.output_vars
labels = self.program_helper.label_vars
losses = self.program_helper.loss_vars
self._losses = losses
metrics = self.program_helper.metric_vars
self._inputs = inputs
self._labels = labels
paddle.enable_static()
else:
# build program in static mode
......@@ -239,24 +497,28 @@ class Engine:
if serial_main_prog is not None:
return
outputs = []
losses = []
metrics = []
inputs = self._inputs if self._inputs else []
labels = self._labels if self._labels else []
serial_main_prog = self._orig_main_prog.clone()
serial_startup_prog = self._orig_startup_prog.clone()
with static.program_guard(serial_main_prog, serial_startup_prog), \
utils.unique_name.guard():
inputs_spec = self.inputs_spec
labels_spec = self.labels_spec if self.labels_spec else []
inputs = [s._create_feed_layer() for s in inputs_spec]
labels = [s._create_feed_layer() for s in labels_spec]
outputs = to_list(self._model(*inputs))
if mode != "predict" and self._loss:
losses = to_list(self._loss(*(outputs + labels)))
if mode != "predict":
for metric in self._metrics:
metrics.extend(
to_list(metric.compute(*(outputs + labels))))
if not self._skip_build:
with static.program_guard(serial_main_prog, serial_startup_prog), \
utils.unique_name.guard():
outputs = to_list(self._model(*inputs))
if mode != "predict" and self._loss:
losses = to_list(self._loss(*(outputs + labels)))
self._losses = losses
if mode != "predict" and (outputs or labels):
for metric in self._metrics:
metrics.append(
to_list(metric.compute(*(outputs + labels))))
else:
losses = to_list(self._loss)
            self._losses = losses
default_ctx = get_default_distributed_context()
if not default_ctx.has_annotation:
......@@ -299,8 +561,8 @@ class Engine:
self._optimization_tuner = OptimizationTuner(self._tuning.to_dict(),
self._dist_contexts[mode],
dataset,
self.inputs_spec,
self.labels_spec,
self._inputs_spec,
self._labels_spec,
batch_size=batch_size,
rank=self._cur_rank)
......@@ -324,6 +586,7 @@ class Engine:
inputs_var = self._dist_contexts[mode].serial_feed_vars["inputs"]
labels_var = self._dist_contexts[mode].serial_feed_vars["labels"]
block = self._dist_contexts[mode].serial_main_program.global_block()
# TODO: check this feed_list
feed_list = []
for var in inputs_var + labels_var:
if var.name in block.vars:
......@@ -378,18 +641,20 @@ class Engine:
mode].dist_startup_programs
self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars
self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars
self._lr_optimizer = self._dist_contexts[mode]._lr_optimizer
self._optimizer = self._dist_contexts[mode]._serial_optimizer
if self._nranks > 1:
# Traverse different rank programs and traverse each op of them,
# instantiate communication by process_mapping.
all_process_groups = get_all_process_groups()
# NOTE: add the comm init control in the future for auto search
for process_group in all_process_groups:
if self._cur_rank not in process_group.ranks:
continue
process_group.instantiate()
if self._strategy.auto_mode == "full":
                initialize_pg_in_full_mode(all_process_groups, self._cur_rank)
else:
for process_group in all_process_groups:
if self._cur_rank not in process_group.ranks:
continue
process_group.instantiate()
place = _get_device()
if isinstance(place, fluid.CUDAPlace):
......@@ -423,77 +688,26 @@ class Engine:
self._dist_attr)
if self._strategy.reinit:
self._logger.info("NOTE: parameters wiil be re-initialized.")
self._logger.info("NOTE: parameters will be re-initialized.")
dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank]
self._executor.run(dist_startup_prog)
def _infer_sample_spec(self, data, batch_size, split):
if isinstance(data, paddle.io.IterableDataset):
if split is None:
input, label = next(iter(data))
else:
sample = next(iter(data))
input = sample[:split]
label = sample[split:]
elif isinstance(data, paddle.io.Dataset):
if split is None:
input, label = data[0]
else:
sample = data[0]
input = sample[:split]
label = sample[split:]
else:
raise ValueError(
"Data should be a Dataset or IterableDatset, but received {}.".
format(type(data).__name__))
self.inputs_spec = []
self.labels_spec = []
input_list = to_list(input)
label_list = to_list(label)
def _infer_item_spec(item, name, batch_size, specs):
if isinstance(item, np.ndarray):
spec = InputSpec.from_numpy(item, name)
if batch_size is None:
specs.append(spec)
else:
specs.append(spec.batch(batch_size))
elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)):
spec = InputSpec.from_tensor(item, name)
if batch_size is None:
specs.append(spec)
else:
specs.append(spec.batch(batch_size))
else:
specs.append(InputSpec([batch_size], type(item), name))
if input_list is not None:
for i, item in enumerate(input_list):
assert item is not None, "Receive None input."
name = "input" + str(i)
_infer_item_spec(item, name, batch_size, self.inputs_spec)
if label_list is not None:
for i, item in enumerate(label_list):
assert item is not None, "Receive None input."
name = "label" + str(i)
_infer_item_spec(item, name, batch_size, self.labels_spec)
self.inputs_spec = self._validate_spec(self.inputs_spec)
self.labels_spec = self._validate_spec(self.labels_spec)
def fit(self,
train_data,
train_sample_split=None,
batch_size=1,
epochs=1,
steps_per_epoch=None,
log_freq=10,
save_dir=None,
save_freq=1,
valid_data=None,
valid_sample_split=None,
valid_freq=1,
valid_steps=None,
collate_fn=None,
callbacks=None):
callbacks=None,
verbose=2):
"""
Trains the model for a fixed number of epochs. If `valid_data` is set,
evaluation will be done at the end of each epoch.
......@@ -560,80 +774,90 @@ class Engine:
epochs=2,
batch_size=64)
"""
self.mode = 'train'
self._infer_sample_spec(train_data, batch_size, train_sample_split)
if not self._mode_init_states[self.mode]:
self._prepare_single_mode(self.mode)
self._mode = 'train'
self._inputs_spec, self._labels_spec = self._prepare_data_spec(
train_data, train_sample_split, batch_size)
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
if not self._has_prepared[self._mode]:
self._prepare_program(self._mode)
else:
self._switch_mode("train")
assert self.mode in self._dist_main_progs, \
"train model is not ready, please call `engine._prepare_single_mode('train')` first."
train_dataloader = self._create_dataloader(train_data, batch_size,
epochs, steps_per_epoch,
collate_fn)
fetch_loss = self._validate_fetches(self.fetch_vars["loss"])
fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"])
inner_fetch = dict(fetch_loss, **fetch_metrics)
usr_fetch = self._validate_fetches(_get_fetches())
fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch)
lr_scheduler = self._get_lr_scheduler(self.main_program)
outputs = defaultdict(list)
self._switch_mode(self._mode)
assert self._mode in self._dist_main_progs, \
"train model is not ready, please call `engine._prepare_program('train')` first."
train_dataloader = self._prepare_dataloader_from_generator(
dataset=train_data,
capacity=70,
iterable=False,
batch_size=batch_size,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
collate_fn=collate_fn)
fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode)
cbks = config_callbacks(
callbacks,
engine=self,
batch_size=batch_size,
epochs=epochs,
steps=train_dataloader._steps,
log_freq=log_freq,
save_freq=save_freq,
save_dir=save_dir,
verbose=verbose,
metrics=self._metrics_name(),
acc_step=self._k_steps,
)
cbks.on_begin('train')
for epoch in range(epochs):
train_logs = {"epoch: {:d} ": epoch}
logs = {}
cbks.on_epoch_begin(epoch)
for step, _ in enumerate(train_dataloader):
cbks.on_batch_begin('train', step, logs)
try:
outs = self._executor.run(
self.main_program,
fetch_list=fetch_list,
fetch_list=fetch_names,
use_program_cache=self._strategy.use_cache,
return_numpy=self._strategy.return_numpy)
except core.EOFException:
break
train_logs["step: {:d} "] = step
# update lr
if lr_scheduler and step % self._k_steps == 0:
lr_scheduler.step()
train_logs["lr: {:5e} "] = self._get_lr(self._lr_optimizer)
# inner fetches
if fetch_loss:
train_logs["loss: {:8f} "] = outs[0][0]
outputs["loss"].append(outs[0][0])
# Metric
if fetch_metrics:
metric_out = outs[len(fetch_loss):len(inner_fetch)]
for metric in self._metrics:
metric.update(*metric_out)
results = metric.accumulate()
for i, res in enumerate(to_list(results)):
train_logs[metric.name()[i] + ": {:8f} "] = res
outputs[metric.name()[i]].append(outs[0][0])
# user fetches
user_outs = outs[len(inner_fetch):]
user_fetch_list = fetch_list[len(inner_fetch):]
for i, out in enumerate(user_outs):
train_logs[fetch_map[user_fetch_list[i]] + ": {}"] = out
# logger
string = '[train] ' + ''.join(list(train_logs.keys()))
self._logger.info(string.format(*list(train_logs.values())))
if valid_data and epoch % valid_freq == 0:
self.evaluate(valid_data, valid_sample_split, batch_size,
valid_steps, collate_fn, callbacks)
lr = get_lr(self._optimizer)
logs = self._prepare_logger(outs, epoch, step, lr, fetch_names,
fetch_indices, self._mode)
cbks.on_batch_end('train', step, logs)
if valid_data and (epoch + 1) % valid_freq == 0:
val_logs = self.evaluate(valid_data, valid_sample_split,
batch_size, valid_steps, log_freq,
collate_fn, callbacks, verbose)
val_logs = {
"val_" + name: val
for name, val in val_logs.items()
}
logs.update(val_logs)
self._switch_mode("train")
else:
self._reset_metrics()
return outputs
cbks.on_epoch_end(epoch, logs)
cbks.on_end('train', logs)
return self.history
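    # NOTE: self.history is the History callback instance populated in
    # on_epoch_end; its .history dict maps logged keys to per-epoch values,
    # e.g. (illustratively) {'loss': [...], 'acc': [...]}.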
def evaluate(self,
valid_data,
valid_sample_split=None,
batch_size=1,
steps=None,
log_freq=10,
collate_fn=None,
callbacks=None):
callbacks=None,
verbose=2):
"""
Evaluate the loss and metrics of the model on evaluation data.
......@@ -652,7 +876,7 @@ class Engine:
the sample list, None for only stack each fields of sample in axis
0. Default None.
callbacks (Callback|None, optional): A list of `Callback` instances to apply
during evaling. Default: None. (Unused for now)
during evaluating. Default: None. (Unused for now)
Returns:
None
......@@ -680,60 +904,59 @@ class Engine:
engine.evaluate(valid_dataset, batch_size=64)
"""
        self._mode = 'eval'
        self._inputs_spec, self._labels_spec = self._prepare_data_spec(
            valid_data, valid_sample_split, batch_size)
        self._inputs, self._labels = self._prepare_data_tensor(
            self._inputs_spec, self._labels_spec)
        if not self._has_prepared[self._mode]:
            self._prepare_program(self._mode)
        else:
            self._switch_mode(self._mode)
        assert self._mode in self._dist_main_progs, \
            "eval model is not ready, please call `engine._prepare_program('eval')` first."
        valid_dataloader = self._prepare_dataloader_from_generator(
            dataset=valid_data,
            capacity=70,
            iterable=False,
            batch_size=batch_size,
            steps_per_epoch=steps,
            collate_fn=collate_fn)
        fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode)
cbks = config_callbacks(
callbacks,
engine=self,
batch_size=batch_size,
log_freq=log_freq,
verbose=verbose,
metrics=self._metrics_name(),
)
eval_steps = valid_dataloader._steps
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics': self._metrics_name()
})
logs = {}
for step, _ in enumerate(valid_dataloader):
cbks.on_batch_begin('eval', step, logs)
try:
outs = self._executor.run(
self.main_program,
                    fetch_list=fetch_names,
use_program_cache=self._strategy.use_cache,
return_numpy=self._strategy.return_numpy)
except core.EOFException:
break
            logs = self._prepare_logger(outs, None, step, None, fetch_names,
                                        fetch_indices, self._mode)
cbks.on_batch_end('eval', step, logs)
cbks.on_end('eval', logs)
self._reset_metrics()
        return logs
def predict(self,
test_data,
......@@ -741,7 +964,8 @@ class Engine:
batch_size=1,
steps=None,
collate_fn=None,
                callbacks=None,
                verbose=2):
"""
Compute the output predictions on testing data.
......@@ -785,72 +1009,223 @@ class Engine:
engine = auto.Engine(model)
engine.predict(valid_dataset, batch_size=64)
"""
        self._mode = 'predict'
        self._inputs_spec, self._labels_spec = self._prepare_data_spec(
            test_data, test_sample_split, batch_size)
        self._inputs, self._labels = self._prepare_data_tensor(
            self._inputs_spec, self._labels_spec)
        if not self._has_prepared[self._mode]:
            self._prepare_program(self._mode)
        else:
            self._switch_mode(self._mode)
        assert self._mode in self._dist_main_progs, \
            "predict model is not ready, please call `engine._prepare_program('predict')` first."
        test_dataloader = self._prepare_dataloader_from_generator(
            dataset=test_data,
            capacity=70,
            iterable=False,
            batch_size=batch_size,
            steps_per_epoch=steps,
            collate_fn=collate_fn)
        fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode)
outputs = []
cbks = config_callbacks(callbacks, engine=self, verbose=verbose)
test_steps = test_dataloader._steps
cbks.on_begin('predict', {'steps': test_steps})
logs = {}
for step, _ in enumerate(test_dataloader):
cbks.on_batch_begin('predict', step, logs)
try:
outs = self._executor.run(
self.main_program,
                    fetch_list=fetch_names,
use_program_cache=self._strategy.use_cache,
return_numpy=self._strategy.return_numpy)
except core.EOFException:
break
            logs = self._prepare_logger(outs, None, step, None, fetch_names,
                                        fetch_indices, self._mode)
cbks.on_batch_end('predict', step, logs)
outputs.append(list(logs["outputs"].values()))
cbks.on_end('predict', logs)
return outputs
def dataloader(self,
dataset,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=None,
num_workers=0,
use_buffer_reader=True,
use_shared_memory=True,
timeout=0,
worker_init_fn=None,
epochs=1,
steps_per_epoch=None,
sample_split=1,
mode=None):
if mode is not None:
self.to_mode(mode)
self._inputs_spec, self._labels_spec = self._prepare_data_spec(
dataset, sample_split, batch_size)
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
if not self._has_prepared[self._mode]:
self._prepare_program(self._mode)
else:
self._switch_mode(self._mode)
dataloader = self._prepare_dataloader(
dataset,
return_list=False,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
collate_fn=collate_fn,
num_workers=num_workers,
use_buffer_reader=use_buffer_reader,
use_shared_memory=use_shared_memory,
timeout=timeout,
worker_init_fn=worker_init_fn,
epochs=epochs,
steps_per_epoch=steps_per_epoch)
return dataloader
def dataloader_from_generator(self,
dataset,
capacity=70,
use_double_buffer=True,
iterable=True,
use_multiprocess=False,
drop_last=True,
batch_size=1,
epochs=1,
steps_per_epoch=None,
collate_fn=None,
sample_split=1,
mode=None):
if mode is not None:
self.to_mode(mode)
self._inputs_spec, self._labels_spec = self._prepare_data_spec(
dataset, sample_split, batch_size)
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
if not self._has_prepared[self._mode]:
self._prepare_program(self._mode)
else:
self._switch_mode(self._mode)
dataloader = self._prepare_dataloader_from_generator(
dataset=dataset,
capacity=capacity,
use_double_buffer=use_double_buffer,
iterable=iterable,
return_list=False,
use_multiprocess=use_multiprocess,
drop_last=drop_last,
batch_size=batch_size,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
collate_fn=collate_fn)
return dataloader
def prepare(self,
inputs_spec=None,
labels_spec=None,
inputs=None,
labels=None,
main_program=None,
startup_program=None,
mode=None):
if mode is not None:
self.to_mode(mode)
if inputs or labels:
self._skip_build = True
self._inputs_spec = inputs_spec
self._labels_spec = labels_spec
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec, inputs, labels)
self._orig_main_prog = main_program
if self._orig_main_prog is None:
self._orig_main_prog = static.default_main_program()
self._orig_startup_prog = startup_program
if self._orig_startup_prog is None:
self._orig_startup_prog = static.default_startup_program()
if not self._has_prepared[self._mode]:
self._prepare_program(self._mode)
else:
self._switch_mode(self._mode)
elif inputs_spec or labels_spec:
self._inputs_spec = inputs_spec
self._labels_spec = labels_spec
self._outside_dataloader = True
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
self._orig_main_prog = main_program
if self._orig_main_prog is None:
self._orig_main_prog = static.default_main_program()
self._orig_startup_prog = startup_program
if self._orig_startup_prog is None:
self._orig_startup_prog = static.default_startup_program()
if not self._has_prepared[self._mode]:
self._prepare_program(self._mode)
else:
self._switch_mode(self._mode)
else:
assert self._inputs_spec and self._labels_spec, \
"Please call the dataloader(...) before calling prepare(...)"
def run(self, data=None, feed=None, fetch_list=None, mode=None):
if mode is not None:
self.to_mode(mode)
feed_dict = self._prepare_feed(data, feed, self._mode)
fetch_names, fetch_indices = self._prepare_fetch(fetch_list, self._mode)
if self._outside_dataloader and not self._has_prepared_reader[
self._mode]:
self._prepare_reader()
outs = self._executor.run(self.main_program,
feed=feed_dict,
fetch_list=fetch_names,
use_program_cache=self._strategy.use_cache,
return_numpy=self._strategy.return_numpy)
logs = self._prepare_logger(outs, None, None, None, fetch_names,
fetch_indices, self._mode)
return logs
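# A hedged sketch of the fine-grained loop the dataloader/prepare/run trio
# above enables; it reuses the `engine` and RandomDataset placeholders from
# the earlier fit sketch, and the exact feeding behavior may differ by mode
# and strategy.
loader = engine.dataloader(RandomDataset(), batch_size=16, mode="train")
for batch in loader:
    logs = engine.run(batch, mode="train")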
def _prepare_dataloader(self,
dataset,
return_list=True,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=None,
num_workers=0,
use_buffer_reader=True,
use_shared_memory=True,
timeout=0,
worker_init_fn=None,
epochs=1,
steps_per_epoch=None):
if self._strategy.gradient_merge and batch_size is not None:
assert batch_size % self._k_steps == 0, \
"Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self._k_steps)
batch_size //= self._k_steps
        dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank]
        dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank]
        dist_context = self._dist_contexts[self._mode]
dist_main_block = dist_main_prog.global_block()
        # NOTE: Get feed_list, then insert the dataloader op with sharded var shapes.
        # Because predict_program does not contain the labels var, we add the
        # labels var from serial_program to dist_program so that the length of
        # feed_list matches the number of fields in the dataset's samples.
        inputs_var = self._feed_vars[self._mode]["inputs"]
        labels_var = self._feed_vars[self._mode]["labels"]
feed_list = []
for var in inputs_var + labels_var:
if var.name in dist_main_block.vars:
......@@ -860,45 +1235,99 @@ class Engine:
copy_var.desc.set_original_id(var.desc.original_id())
feed_list.append(copy_var)
places = paddle.static.cuda_places()
with static.program_guard(dist_main_prog, dist_startup_prog):
            dataloader = DistributedDataLoader(
dataset,
feed_list=feed_list,
places=places,
return_list=return_list,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
collate_fn=collate_fn,
num_workers=num_workers,
use_buffer_reader=use_buffer_reader,
use_shared_memory=use_shared_memory,
timeout=timeout,
worker_init_fn=worker_init_fn,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
split_data=self._strategy.split_data,
data_parallel_world_size=self._dp_world_sizes,
                data_parallel_rank=self._dp_ranks)
return dataloader
def _prepare_dataloader_from_generator(self,
dataset,
capacity=None,
use_double_buffer=True,
iterable=True,
return_list=False,
use_multiprocess=False,
drop_last=True,
batch_size=1,
epochs=1,
steps_per_epoch=None,
collate_fn=None):
if self._strategy.gradient_merge and batch_size is not None:
assert batch_size % self._k_steps == 0, \
"Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self._k_steps)
batch_size //= self._k_steps
dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank]
dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank]
dist_context = self._dist_contexts[self._mode]
dist_main_block = dist_main_prog.global_block()
        # NOTE: Get feed_list, then insert the dataloader op with sharded var shapes.
        # Because predict_program does not contain the labels var, we add the
        # labels var from serial_program to dist_program so that the length of
        # feed_list matches the number of fields in the dataset's samples.
inputs_var = self._feed_vars[self._mode]["inputs"]
labels_var = self._feed_vars[self._mode]["labels"]
feed_list = []
for var in inputs_var + labels_var:
if var.name in dist_main_block.vars:
feed_list.append(dist_main_block.vars[var.name])
else:
copy_var = dist_main_block._clone_variable(var, var.persistable)
copy_var.desc.set_original_id(var.desc.original_id())
feed_list.append(copy_var)
places = paddle.static.cuda_places()
with static.program_guard(dist_main_prog, dist_startup_prog):
dataloader = DistributedDataLoaderFromGenerator(
dataset=dataset,
feed_list=feed_list,
capacity=capacity,
use_double_buffer=use_double_buffer,
iterable=iterable,
return_list=return_list,
use_multiprocess=use_multiprocess,
drop_last=drop_last,
places=places,
batch_size=batch_size,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
collate_fn=collate_fn,
split_data=self._strategy.split_data,
data_parallel_world_size=self._dp_world_sizes,
data_parallel_rank=self._dp_ranks)
self._prepare_reader()
return dataloader
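# Minimal standalone sketch of the gradient-merge batch check used in both
# dataloader helpers above: the user-visible batch size must split evenly into
# k_steps micro batches (numbers here are illustrative).
def micro_batch_size(batch_size, k_steps):
    assert batch_size % k_steps == 0, \
        "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(
            batch_size, k_steps)
    return batch_size // k_steps

assert micro_batch_size(64, 4) == 16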
def _tune(self, tune_data, tune_sample_split=None, batch_size=1):
self._mode = 'train'
self._inputs_spec, self._labels_spec = self._prepare_data_spec(
tune_data, tune_sample_split, batch_size)
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
self._optimization_tuning(self._mode, tune_data, batch_size)
def _validate_spec(self, specs):
specs = to_list(specs)
self._k_steps = self._strategy.gradient_merge.k_steps
......@@ -921,32 +1350,6 @@ class Engine:
var_name = _to_name_str(var)
return var_name in self.main_program.global_block().vars
def _get_input_split_info(self, var, dist_context):
# deduce how the input data is split among the cluster
from .utils import _get_comm_group, _get_corresponding_rank
......@@ -1007,9 +1410,20 @@ class Engine:
for metric in self._metrics:
metric.reset()
def _metrics_name(self):
metrics_name = ['loss'] if self._loss else []
for m in self._metrics:
metrics_name.extend(to_list(m.name()))
return metrics_name
def _switch_mode(self, mode):
self.to_mode(mode)
self._optimizer = self._dist_contexts[mode]._serial_optimizer
def to_mode(self, mode):
assert mode in ["train", "eval", "predict"], \
"mode {} should be one of ['train', 'eval', 'predict']".format(mode)
self._mode = mode
def _set_state_dict(self, mode, strict, state_dict, dist_attr):
program = self._dist_main_progs[mode][self._cur_rank]
......@@ -1029,7 +1443,7 @@ class Engine:
                is 'dirname/file_prefix' or 'file_prefix'. If it is an empty
                string, an exception will be raised.
training (bool, optional): Whether to save for training. If not, save
                for inference only. If `training` is set to True, the optimizer state
for inference only. If `training` is set to True, the optimizer state
will be saved. Otherwise, only the model and parameters are saved.
This function will silently overwrite existing file at the target
location. Default: True.
......@@ -1065,20 +1479,19 @@ class Engine:
"""
if training:
assert 'train' in self._serial_main_progs, \
"training model is not ready, please call `engine._prepare_single_mode('train')` first."
serial_program = self._serial_main_progs["train"]
dist_main_prog = self._dist_main_progs["train"][self._cur_rank]
dist_context = self._dist_contexts["train"]
assert self._mode in self._serial_main_progs
serial_program = self._serial_main_progs[self._mode]
dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank]
dist_context = self._dist_contexts[self._mode]
self._saver.save(path,
serial_program=serial_program,
dist_main_program=dist_main_prog,
dist_context=dist_context)
else:
mode = "predict"
feed_vars = self._feed_vars[mode]['inputs']
fetch_vars = self._fetch_vars[mode]['outputs']
dist_main_prog = self._dist_main_progs[mode][self._cur_rank]
assert "predict" in self._dist_main_progs
feed_vars = self._feed_vars["predict"]['inputs']
fetch_vars = self._fetch_vars["predict"]['outputs']
dist_main_prog = self._dist_main_progs["predict"][self._cur_rank]
self._saver.save_inference_model(path,
feed_vars,
fetch_vars,
......@@ -1097,7 +1510,7 @@ class Engine:
the parameter in file storing model states of or receives a
mismatch shape). Default: False.
load_optimizer (bool, optional): If True, the stored optimizer
                states are restored. Otherwise, the optimizer states are initialized
states is restored. Otherwise, the optimizer states is initialized
from scratch. Default: False.
Returns:
......@@ -1136,65 +1549,82 @@ class Engine:
path, load_optimizer)
return self._state_dict, self._dist_attr
def cost(self, inputs_spec=None, labels_spec=None, mode="train"):
"""
        Get and print the cost, including the memory of every rank,
        the max memory among all ranks, and the global cost of one step based
        on the communication cost (computation cost is 0 by default).
        In the future, the flops information of every rank and the global cost
        including the computation cost will be added.
Args:
inputs_spec(InputSpec): The specification of inputs. Default: None.
labels_spec(InputSpec): The specification of labels. Default: None.
mode (str): The engine mode must be in ["train", "predict", "eval"]. Default: "train".
Returns:
            The global execution time (ms) and the max memory (B).
"""
# Check parallel mode
if self._strategy.auto_mode == "full":
print(
"The cost will be calcudated in the search process when the auto mode is full."
)
return
# Check mode
accepted_modes = ["train", "predict", "eval"]
if mode not in accepted_modes:
raise ValueError("The mode {} is not in accepted modes {}".format(
mode, accepted_modes))
self.to_mode(mode)
if inputs_spec is not None:
self._inputs_spec, self._labels_spec = inputs_spec, labels_spec
self._inputs, self._labels = self._prepare_data_tensor(
self._inputs_spec, self._labels_spec)
self._build(mode)
self._plan(mode)
        else:
            if _non_static_mode() or self._dygraph_mode:
                raise ValueError(
                    "Please call `engine._prepare_program('mode')` firstly when in the static graph mode."
                )
        # Estimate the exec cost and max memory
        global_cost, max_memory = get_cost_from_engine(self, mode)
return global_cost.time, max_memory
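# A hedged usage sketch of the new cost interface above. The specs and model
# are placeholders; with auto_mode == "full" the call only prints and returns
# None, and a cluster description may be needed for realistic estimates.
import paddle
from paddle.distributed.fleet import auto
from paddle.static import InputSpec

model = paddle.nn.Linear(32, 10)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
engine = auto.Engine(model, loss, optimizer)
time_cost, max_memory = engine.cost(
    inputs_spec=[InputSpec([None, 32], "float32", "x")],
    labels_spec=[InputSpec([None, 1], "int64", "label")],
    mode="train")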
@property
def main_program(self):
        return self._dist_main_progs[self._mode][self._cur_rank]
@property
def startup_program(self):
        return self._dist_startup_progs[self._mode][self._cur_rank]
@property
def dist_context(self):
        return self._dist_contexts[self._mode]
@property
def serial_main_program(self):
        return self._serial_main_progs[self._mode]
@property
def serial_startup_program(self):
        return self._serial_startup_progs[self._mode]
@property
def fetch_vars(self):
        return self._fetch_vars[self._mode]
@property
def inputs(self):
        return self._inputs
@property
def labels(self):
        return self._labels
......@@ -139,7 +139,7 @@ class ProxyLayer(Layer):
"""
outs = []
for metric in self.metrics:
            outs.append(to_list(metric.compute(*inputs)))
return outs
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import paddle
from paddle.fluid import core
from .process_mesh import ProcessMesh
......@@ -196,15 +198,36 @@ def recompute(op):
return RecomputeOperator(op)
_g_collections = {}
class CollectionNames(object):
FETCHES = "fetches"
LOGGING = "logging"
def get_collection(name):
collection = _g_collections.get(name, None)
if collection is None:
collection = []
_g_collections[name] = collection
return _g_collections[name]
def add_to_collection(collection_name, value, name=None):
    if collection_name not in _g_collections:
        _g_collections[collection_name] = []
    if name is not None:
        for _, v in _g_collections[collection_name]:
            if v == value: return
        _g_collections[collection_name].append((name, value))
    else:
        for _, v in _g_collections[collection_name]:
            if v == value: return
        _g_collections[collection_name].append((None, value))
def fetch(tensor, name=None, logging=False):
add_to_collection(CollectionNames.FETCHES, tensor, name)
if logging:
add_to_collection(CollectionNames.LOGGING, tensor, name)
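# Standalone sketch of the collection semantics above: values are stored as
# (name, value) pairs and a value is registered at most once per collection.
# The registry and helper below are simplified stand-ins, not Paddle's own.
_collections = {}

def _add(collection, value, name=None):
    items = _collections.setdefault(collection, [])
    if any(v == value for _, v in items):
        return
    items.append((name, value))

_add("fetches", "loss_var", name="loss")
_add("fetches", "loss_var")  # deduplicated
assert _collections["fetches"] == [("loss", "loss_var")]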
......@@ -33,3 +33,5 @@ from . import dist_slice
from . import dist_fused_feedforward
from . import dist_fused_attention
from . import dist_reduce_sum_p
from . import dist_shape
from . import dist_assign
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .dist_default import DistributedDefaultImpl0
from ..utils import compute_compatible_and_update_dim_mapping
class DistributedAssign(DistributedOperatorImplContainer):
def __init__(self, op_type):
super(DistributedAssign, self).__init__(op_type)
register_distributed_operator_impl_container(DistributedAssign("assign"))
class DistributedAssignImpl(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedAssignImpl, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
return True
def is_output_compatible(self, dist_op):
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
out_name = op_desc.output('Out')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if x_dims_mapping != out_dims_mapping:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
out_name = op_desc.output('Out')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
for i in range(len(x_dims_mapping)):
dim_changed = compute_compatible_and_update_dim_mapping(
[x_dims_mapping, out_dims_mapping], [i, i])
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
@staticmethod
def backward(ctx, *args, **kwargs):
DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
register_distributed_operator_impl("assign", DistributedAssignImpl("assign"))
......@@ -1308,6 +1308,8 @@ class DistributedMatmulV2Impl0(DistributedOperatorImpl):
process_mesh = dist_attr.process_mesh
processes = process_mesh.processes
# col parallel: matmul + allreduce
if backward_op.attr("trans_y"):
Y_var_dim_mapping.reverse()
assert Y_var_dim_mapping[0] < 0
parallel_axis = Y_var_dim_mapping[1]
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .dist_default import DistributedDefaultImpl0
from ..utils import is_dim_shard
class DistributedShape(DistributedOperatorImplContainer):
def __init__(self, op_type):
super(DistributedShape, self).__init__(op_type)
register_distributed_operator_impl_container(DistributedShape("shape"))
class DistributedShapeImpl(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedShapeImpl, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
assert len(out_dims_mapping) == 1
if is_dim_shard(out_dims_mapping[0]):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
return True
def update_dims_mapping(self, dist_op):
return False
@staticmethod
def forward(ctx, *args, **kwargs):
DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
@staticmethod
def backward(ctx, *args, **kwargs):
DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
register_distributed_operator_impl("shape", DistributedShapeImpl("shape"))
......@@ -101,8 +101,12 @@ class DistributedSplitImpl(DistributedOperatorImpl):
return changed
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)) or \
(not self.is_compatible(dist_op)):
return False
return True
@staticmethod
def forward(ctx, *args, **kwargs):
......
......@@ -23,14 +23,12 @@ import logging
import pickle
import time
import paddle
from paddle.fluid.backward import append_backward
from paddle.distributed.utils.log_utils import get_logger
from paddle.distributed.fleet import cloud_utils
import paddle.fluid.core as core
from paddle.fluid import program_guard
from paddle.distributed.passes import new_pass, PassContext
from .dist_context import DistributedContext
from .dist_context import get_default_distributed_context
from .dist_context import set_default_distributed_context
from .completion import Completer
from .partitioner import Partitioner
......@@ -40,9 +38,7 @@ from .process_group import get_world_process_group
from .process_group import _g_process_group_map, ProcessGroup
from .utils import make_data_unshard
from .utils import set_grad_var_shape
from .utils import print_program_with_dist_attr
from .utils import SerialProgramInfo
from .utils import get_logger
from .reshard import Resharder
from .cluster import Cluster
from .mapper import mapping
......@@ -148,7 +144,7 @@ class AutoParallelizer:
with program_guard(main_program, startup_program):
optimize_ops = optimizer.apply_gradients(params_grads)
        self._dist_context._serial_optimizer = optimizer
# update completion
self._completer = Completer(self._dist_context)
self._completer.complete_update_annotation(main_program)
......
......@@ -15,24 +15,17 @@
import copy
import time
import logging
from collections import defaultdict
import paddle
from paddle.fluid import program_guard
from paddle.fluid.backward import append_backward
from paddle.fluid.framework import unique_name
from paddle.distributed.passes import new_pass
from .reshard import Resharder
from .partitioner import Partitioner
from .dist_op import DistributedOperator
from .dist_saver import DistributedSaver
from .utils import set_grad_var_shape
from .process_group import get_world_process_group
from ..utils.log_utils import get_logger
class Parallelizer:
......@@ -69,7 +62,7 @@ class Parallelizer:
serial_main_program, serial_startup_program, params_grads = self._apply_pre_optimization(
serial_main_program, serial_startup_program, serial_loss,
serial_optimizer, params_grads)
            self._logger.debug(
"within parallel apply_pre_optimization time: {}, mode {}".
format(time.time() - time0, self._mode))
# Do logical partition
......@@ -77,14 +70,14 @@ class Parallelizer:
partitioner = Partitioner(self._dist_context, rank)
dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition(
serial_main_program, serial_startup_program, params_grads)
            self._logger.debug(
"within parallel partitioner time: {}, mode {}".format(
time.time() - time0, self._mode))
# Generate optimizer
time0 = time.time()
self._generate_optimizer(dist_main_prog, dist_startup_prog,
serial_optimizer, dist_params_grads)
            self._logger.debug(
"within parallel optimizer time: {}, mode {}".format(
time.time() - time0, self._mode))
# Do reshard process
......@@ -93,14 +86,14 @@ class Parallelizer:
resharder = Resharder(dist_main_prog, dist_startup_prog, rank,
self._dist_context, dist_params_grads)
resharder.reshard()
            self._logger.debug(
"within parallel reshard time: {}, mode {}".format(
time.time() - time0, self._mode))
# Apply post optimization passes
time0 = time.time()
self._apply_post_optimization(dist_main_prog, dist_startup_prog,
rank, dist_params_grads)
            self._logger.debug(
"within parallel apply_post_optimization time: {}, mode {}".
format(time.time() - time0, self._mode))
else:
......@@ -109,7 +102,7 @@ class Parallelizer:
self._apply_pre_optimization(serial_main_program,
serial_startup_program, None, None,
None)
            self._logger.debug(
"within parallel apply_pre_optimization time: {}, mode {}".
format(time.time() - time0, self._mode))
# Do logical partition
......@@ -118,14 +111,14 @@ class Parallelizer:
dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition(
serial_main_program, serial_startup_program, [])
# Do reshard process
            self._logger.debug(
"within parallel partitioner time: {}, mode {}".format(
time.time() - time0, self._mode))
time0 = time.time()
resharder = Resharder(dist_main_prog, dist_startup_prog, rank,
self._dist_context, [], 1)
resharder.reshard()
            self._logger.debug(
"within parallel reshard time: {}, mode {}".format(
time.time() - time0, self._mode))
# Clone program for test
......@@ -150,7 +143,7 @@ class Parallelizer:
# NOTE: `apply_gradients` will add an Accumulator for a parameter only once,
# but optimizer will be called repeatedly in re-launch, so optimizer need to be copied.
optimizer = copy.deepcopy(optimizer)
        self._dist_context._serial_optimizer = optimizer
with program_guard(main_program, startup_program):
with unique_name.guard("opt_"):
optimizer_ops = optimizer.apply_gradients(params_grads)
......@@ -177,9 +170,7 @@ class Parallelizer:
startup_program = self._pass_context.get_attr("startup_program")
params_grads = self._pass_context.get_attr("params_grads")
        # apply amp pass on train/eval/predict
if self._strategy.amp.enable:
config = copy.deepcopy(self._strategy.amp.to_dict())
config["dist_context"] = self._dist_context
......
......@@ -28,7 +28,7 @@ from .utils import set_dist_op_desc_original_id
from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op
from .operators.common import BACKWARD_ONLY_DIST_OPS
__varname_not_in_block__ = ["lod_tensor_blocking_queue_0"]
__varname_not_in_block__ = ["lod_tensor_blocking_queue"]
__not_shape_var_type__ = [
core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES
]
......@@ -243,7 +243,9 @@ class Partitioner(object):
target_block, serial_input_varname,
new_varname)
else:
                    for varname_not_in_block in __varname_not_in_block__:
                        assert varname_not_in_block in serial_input_varname, \
                            "{} is not found".format(serial_input_varname)
self._serial2dist_varname_mapping[
serial_input_varname] = new_varname
......
......@@ -14,9 +14,7 @@
from .completion import Completer
from .dist_context import get_default_distributed_context
from .utils import print_program_with_dist_attr
from .tuner.parallel_tuner import ParallelTuner
class Planner:
......@@ -39,20 +37,20 @@ class Planner:
self._completer = Completer(self._dist_context)
self._strategy = dist_context.strategy
# set parallel tuner for auto search
if self._strategy.auto_mode == "full":
self._parallel_tuner = ParallelTuner(self._dist_context,
mode=self._mode)
@property
def completer(self):
return self._completer
def plan(self):
if self._strategy.auto_mode == "full":
self._parallel_tuner.tune()
else:
self._completer.complete_forward_annotation()
# parse forward sub block
self._dist_context.block_state.parse_forward_blocks(
self._dist_context.serial_main_program)
......@@ -168,7 +168,10 @@ class ProcessMesh(object):
else:
new_mesh = self._mesh[index]
new_dim_names = self._dim_names[1:]
if new_mesh.shape:
return ProcessMesh(new_mesh, new_dim_names)
else:
return ProcessMesh([new_mesh])
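# Standalone numpy sketch of why the __getitem__ fix above is needed: slicing
# a 2-D mesh with an integer still yields a valid 1-D mesh, but indexing a 1-D
# mesh yields a 0-d value (empty shape) that must be re-wrapped in a list.
import numpy as np

mesh = np.array([[0, 1], [2, 3]])
assert mesh[0].shape == (2,)             # still a valid 1-D mesh
assert np.array([0, 1])[0].shape == ()   # 0-d: needs wrapping as [scalar]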
def __enter__(self):
set_current_process_mesh(self)
......
......@@ -37,6 +37,7 @@ _g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling']
_g_gradient_clip_ops = [
"sum", "sqrt", "fill_constant", "elementwise_max", "elementwise_div"
]
_g_subblock_ops = ["while", "conditional_block"]
def get_var_with_recursion(var_name, block, program):
......@@ -45,10 +46,11 @@ def get_var_with_recursion(var_name, block, program):
if var_name in block.vars:
var = block.vars[var_name]
else:
        var = block._var_recursive(var_name)
        assert var is not None, "{} is not found".format(var_name)
return var
......@@ -1077,7 +1079,9 @@ class Resharder:
new_Out = []
for var_name in while_op.output("Out"):
for output_name in sub_block_op_outputs[::-1]:
                    if output_name.find(var_name) != -1 and (
                            len(var_name) == len(output_name)
                            or "@RESHARD" in output_name):
if output_name not in new_Out:
new_Out.append(output_name)
assert new_Out
......@@ -1106,13 +1110,15 @@ class Resharder:
return False
def is_condition_replicative(self, op):
assert op.type == "while"
sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id]
dist_op = self.dist_context.get_dist_op_for_program(op)
op_dist_attr = dist_op.dist_attr
if op.type == "while":
input_cond = op.input("Condition")
elif op.type == "conditional_block":
input_cond = op.input("Cond")
# the dims mapping of condition tensor should be replicative
for var_name in op.input("Condition"):
for var_name in input_cond:
var = get_var_with_recursion(var_name, sub_block,
self.auto_parallel_main_prog)
dist_tensor = self.dist_context.get_dist_tensor_for_program(var)
......@@ -1662,9 +1668,9 @@ class Resharder:
op.desc.set_input(proto.inputs[0].name,
op.input("X") + while_op_X_append)
    def _get_subblock_input_attrs(self, op, var_name):
        # NOTE: Multiple while loops are not supported
        assert op.type in _g_subblock_ops
sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id]
ops = sub_block.ops
input_attrs = []
......@@ -1715,8 +1721,8 @@ class Resharder:
def get_op_input_attrs(self, op, var_name):
op_input_attrs = []
if op.type == "while":
op_input_attrs = self._get_while_op_input_attrs(op, var_name)
if op.type in _g_subblock_ops:
op_input_attrs = self._get_subblock_input_attrs(op, var_name)
else:
op_input_attrs = self._get_common_op_input_attrs(op, var_name)
......@@ -1738,8 +1744,18 @@ class Resharder:
if len(set(process_mesh.processes)) == len(processes):
global_process_mesh_idx = idx
break
if global_process_mesh_idx is not None:
is_removed = False
global_mesh = self.dist_context.process_meshes[idx]
for i, mesh in enumerate(self.dist_context.process_meshes):
if i == idx:
continue
if set(mesh.processes) < set(global_mesh.processes):
is_removed = True
if is_removed:
self.dist_context.process_meshes.pop(idx)
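# Standalone sketch of the strict-subset test the fix above introduces: the
# global process mesh is only dropped when some other mesh covers a proper
# subset of its ranks.
global_ranks = {0, 1, 2, 3}
assert {0, 1} < global_ranks          # proper subset -> global mesh removable
assert not (global_ranks < global_ranks)  # identical ranks -> keep it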
def _change_subblock_op_input_and_output(self, block_idx, block):
if "var_reshard_mapping" in Resharder.while_block_info[block_idx]:
......@@ -1810,7 +1826,7 @@ class Resharder:
if dist_op is not None:
op_input_dist_attrs = [
] # [(op_process_mesh, op_input_dims_mapping), (op_process_mesh, op_input_dims_mapping)]
if op.type == "while":
if op.type in _g_subblock_ops:
if not self.is_condition_replicative(op):
raise ValueError(
"Please check the condition due to the dims mapping is not replicative."
......@@ -1824,6 +1840,8 @@ class Resharder:
if op.type == "while":
# condition var process mesh is the same with op and dims_mapping is replicative, so it do not need reshard
input_var_names = op.input("X")
elif op.type == "conditional_block":
input_var_names = op.input("Input")
else:
input_var_names = op.input_arg_names
# to avoid while op X order different
......@@ -1831,8 +1849,8 @@ class Resharder:
idx_offset = 0
for var_name in input_var_names:
            # skip lod_tensor_blocking_queue_? name
            if "lod_tensor_blocking_queue" in var_name:
continue
var = get_var_with_recursion(var_name, block,
self.auto_parallel_main_prog)
......@@ -1976,11 +1994,12 @@ class Resharder:
idx = 0
# skip reader and ops whose process mesh is union
skip_ops = [
"create_py_reader", "create_double_buffer_reader", "read", "while",
"create_py_reader", "create_double_buffer_reader", "read",
"write_to_array", "read_from_array"
]
global _g_special_ops
skip_ops += _g_special_ops
skip_ops += _g_subblock_ops
while idx < len(block.ops):
pre_op_count = len(block.ops)
op = block.ops[idx]
......
......@@ -116,6 +116,13 @@ class TuningConfig(BaseConfig):
super(TuningConfig, self).__init__(category, config_dict)
class DatasetConfig(BaseConfig):
def __init__(self, config_dict=None):
category = constants.DATASET
super(DatasetConfig, self).__init__(category, config_dict)
class Strategy(BaseConfig):
"""
    The `Strategy` object is used to configure the parallelization and optimization behaviors.
......@@ -180,3 +187,6 @@ class Strategy(BaseConfig):
config_dict = self._config_dict.get(constants.TUNING, None)
self.tuning = TuningConfig(config_dict)
config_dict = self._config_dict.get(constants.DATASET, None)
self.dataset = DatasetConfig(config_dict)
......@@ -136,12 +136,24 @@ def _copy_context(ref_dist_context):
for key, var_list in ref_dist_context._serial_fetch_vars.items():
new_var_list = []
# metrics is a list of list
if key == "metrics":
for inner_var_list in var_list:
new_inner_var_list = []
for var in inner_var_list:
block_idx = var.block.idx
var_name = var.name
var = new_dist_context._serial_main_program.blocks[
block_idx]._var_recursive(var_name)
new_inner_var_list.append(var)
new_var_list.append(new_inner_var_list)
else:
for var in var_list:
block_idx = var.block.idx
var_name = var.name
var = new_dist_context._serial_main_program.blocks[
block_idx]._var_recursive(var_name)
new_var_list.append(var)
new_dist_context._serial_fetch_vars[key] = new_var_list
# copy information in forward and backward
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import math
import copy
import hashlib
import itertools
from collections import defaultdict
import numpy as np
from ..process_mesh import ProcessMesh
from ..completion import Completer
from ..parallelizer_v2 import Parallelizer
from ..dist_context import _node_id
from ..dist_op import DistributedOperator
from ..operators.common import find_compatible_distributed_operator_impls
from .trial import Trial, TrialStatus
from .tunable_space import TunableSpace
from .tunable_variable import Boolean, IntRange
from ..cost import CostEstimator
class ParallelTuner:
def __init__(self,
dist_context,
mode="train",
max_trials=25,
tuner_id=None,
seed=None,
logger=None,
loop_count=10):
self._loop_count = loop_count
self._estimator = None
self._dist_context = dist_context
assert self._dist_context._is_initialized
self._mode = mode
self._cluster = self._dist_context.cluster
self._num_machines = self._cluster.get_num_machines()
self._num_devices_per_machine = self._cluster.get_num_devices_per_machine(
)
self._space = TunableSpace()
self._objective = "time"
self._direction = "min"
self._max_trials = max_trials
self._tuner_id = tuner_id
self._seed = seed if seed is not None else 9999
print("seed",
self._seed,
"mode",
self._mode,
"num_machies",
self._num_machines,
"num_devices_per_machine",
self._num_devices_per_machine,
flush=True)
self._seed_state = self._seed
self._logger = logger
self._max_collisions = 3
self._tried_values = set()
self._num_trials = 0
self._rng = np.random.default_rng(self._seed)
# Search the op types in the include_op_types,
# and will search all op types if it is empty.
# Exclude the op types in the exclude_op_types
# from the search list.
self._exclude_op_types = []
self._include_op_types = []
# The final dist ops will be searched after considering
# the include_op_types and exclude_op_types.
self._concerned_dist_ops = {}
self._op_id_to_dist_attr_candidates = defaultdict(list)
self._cached_dims_mapping_candidates = {}
self._cached_candidates_info = defaultdict(list)
self._special_ops = [
"create_py_reader", "create_double_buffer_reader", "read", "while",
"read_from_array", "write_to_array"
]
        # Each parallel strategy has three elements: the first is for
        # distributed tensors, the second for distributed operators,
        # and the third for process meshes.
self._init_parallel_strategy = [None, None, None]
self._best_parallel_strategy = [None, None, None]
self._completer = Completer(self._dist_context)
self._parallelizer = Parallelizer(self._mode, self._completer,
self._dist_context)
def _generate_combination(self,
elements,
target,
idx,
partial_candidate,
candidates,
num_candidates=None):
if target == 0:
candidates.append(copy.deepcopy(partial_candidate))
return
if target < 0 or idx == len(elements) \
or len(candidates) > num_candidates:
return
        # Case 1: use elements[idx] (it may be chosen again)
partial_candidate.append(elements[idx])
self._generate_combination(elements, target - elements[idx], idx,
partial_candidate, candidates,
num_candidates)
        # Case 2: skip elements[idx]
partial_candidate.pop()
self._generate_combination(elements, target, idx + 1, partial_candidate,
candidates, num_candidates)
def _permute_combination(self,
combination,
target,
check,
partial_candidate,
candidates,
num_candidates=None,
skip_prob=None):
if num_candidates is not None \
and len(candidates) == num_candidates:
return
if len(partial_candidate) == len(combination):
candidates.append(partial_candidate)
return
for i in range(len(combination)):
if check[i] == 1:
continue
if self._rng.choice([True, False], p=[skip_prob, 1 - skip_prob]):
continue
            if i > 0 and combination[i] == combination[i - 1] \
                    and check[i - 1] == 0:
continue
check[i] = 1
self._permute_combination(combination, target, check,
partial_candidate + [combination[i]],
candidates, num_candidates, skip_prob)
check[i] = 0
def _partition_number(self, target):
log2_target = int(math.log2(target))
elements = [pow(2, i) for i in range(log2_target)]
if pow(2, log2_target) == target:
elements.append(target)
seed_candidates = []
num_seed_candidates = 1000
partial_results = []
self._generate_combination(elements, target, 0, partial_results,
seed_candidates, num_seed_candidates)
candidates = []
for seed_candidate in seed_candidates:
cur_candidates = []
num_cur_candidates = 16
seed_candidate.sort()
check = [0 for i in range(len(seed_candidate))]
if target <= 8:
skip_prob = 0.0
else:
skip_prob = (len(seed_candidate) / target)
self._permute_combination(seed_candidate, target, check, [],
cur_candidates, num_cur_candidates,
skip_prob)
candidates.extend(cur_candidates)
return candidates
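# Standalone sketch of the decomposition `_partition_number` performs above:
# split a device count into power-of-two parts that sum to it (the ordering of
# parts is handled separately by the permutation step).
def power_of_two_partitions(target, max_part=None):
    if target == 0:
        return [[]]
    results = []
    part = 1
    while part <= target:
        if max_part is None or part <= max_part:
            for rest in power_of_two_partitions(target - part, part):
                results.append([part] + rest)
        part *= 2
    return results

# e.g. 4 -> [[1, 1, 1, 1], [2, 1, 1], [2, 2], [4]]
print(power_of_two_partitions(4))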
def _partition_devices(self, num_machines, num_devices_per_machine):
inter_node_partitions = self._partition_number(num_machines)
intra_node_partitions = self._partition_number(num_devices_per_machine)
return inter_node_partitions, intra_node_partitions
def _generate_process_mesh_list(self, inter_node_partition,
intra_node_partition):
process_mesh_list = []
start_row = 0
start_col = 0
for m in inter_node_partition:
start_col = 0
for n in intra_node_partition:
process_mesh = []
for p in range(m):
start = (start_row +
p) * self._num_devices_per_machine + start_col
tmp = []
for q in range(n):
tmp.append(start + q)
process_mesh.append(tmp)
process_mesh_list.append(copy.deepcopy(process_mesh))
start_col += n
start_row += m
return process_mesh_list
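# Standalone sketch of the mesh construction above: an (inter, intra)
# partition pair carves the machine x device grid into rectangular process
# meshes, one per (m, n) block.
def build_meshes(inter_parts, intra_parts, devices_per_machine):
    meshes, start_row = [], 0
    for m in inter_parts:
        start_col = 0
        for n in intra_parts:
            mesh = [[(start_row + p) * devices_per_machine + start_col + q
                     for q in range(n)] for p in range(m)]
            meshes.append(mesh)
            start_col += n
        start_row += m
    return meshes

# 2 machines x 2 devices each, split as ([2], [1, 1]):
# -> [[[0], [2]], [[1], [3]]]
print(build_meshes([2], [1, 1], 2))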
def _generate_dims_mapping_candidates_helper(self, dims_mapping, dims_list,
start, visited, candidates):
if start == len(dims_mapping) or all(visited):
candidates.append(copy.deepcopy(dims_mapping))
return
for idx, dim in enumerate(dims_list):
if visited[idx] == False:
dims_mapping[start] = dim
visited[idx] = True
self._generate_dims_mapping_candidates_helper(
dims_mapping, dims_list, start + 1, visited, candidates)
visited[idx] = False
dims_mapping[start] = -1
self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list,
start + 1, visited,
candidates)
def _generate_dims_mapping_candidates(self, dims_mapping_len,
process_mesh_len):
assert dims_mapping_len >= 1 and process_mesh_len >= 1
key = (dims_mapping_len, process_mesh_len)
if key in self._cached_dims_mapping_candidates:
return self._cached_dims_mapping_candidates[key]
candidates = []
dims_mapping = [-1 for i in range(dims_mapping_len)]
dims_list = [i for i in range(process_mesh_len)]
visited = [False for i in range(process_mesh_len)]
self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list,
0, visited, candidates)
self._cached_dims_mapping_candidates[key] = candidates
return candidates
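# Standalone sketch of the candidate enumeration above: each tensor dim is
# mapped either to a distinct mesh axis or to -1 (replicated), and a mesh
# axis may shard at most one tensor dim.
import itertools

def dims_mapping_candidates(tensor_ndim, mesh_ndim):
    candidates = set()
    for combo in itertools.product([-1] + list(range(mesh_ndim)),
                                   repeat=tensor_ndim):
        used = [d for d in combo if d >= 0]
        if len(used) == len(set(used)):
            candidates.add(combo)
    return sorted(candidates)

# A 2-D tensor on a 2-D mesh -> 7 candidates, e.g. (-1, -1), (0, 1), (1, -1)
print(dims_mapping_candidates(2, 2))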
def _generate_dist_attr_candidates(self, op_id, dist_op):
# For now, only allow the process meshes have two dimensions
process_mesh_len = 2
serial_op = dist_op.serial_op
op_dist_attr = dist_op.dist_attr
if serial_op.type in self._special_ops:
return [copy.deepcopy(op_dist_attr)]
key = []
key.append(serial_op.type)
for input_name in serial_op.input_names:
key.append(input_name)
for input_arg_name in serial_op.input(input_name):
key.append(
len(op_dist_attr.get_input_dims_mapping(input_arg_name)))
for output_name in serial_op.output_names:
key.append(output_name)
for output_arg_name in serial_op.output(output_name):
key.append(
len(op_dist_attr.get_output_dims_mapping(output_arg_name)))
key = tuple(key)
if key in self._cached_candidates_info:
cached_dist_attr_candidates = []
cached_input_arg_names = self._cached_candidates_info[key][0]
cached_output_arg_names = self._cached_candidates_info[key][1]
for cached_dist_attr in self._cached_candidates_info[key][2]:
new_op_dist_attr = copy.deepcopy(dist_op.dist_attr)
i = 0
for input_name in serial_op.input_names:
for input_arg_name in serial_op.input(input_name):
cached_dims_mapping = cached_dist_attr.get_input_dims_mapping(
cached_input_arg_names[i])
new_op_dist_attr.set_input_dims_mapping(
input_arg_name, cached_dims_mapping)
i += 1
i = 0
for output_name in serial_op.output_names:
for output_arg_name in serial_op.output(output_name):
cached_dims_mapping = cached_dist_attr.get_output_dims_mapping(
cached_output_arg_names[i])
new_op_dist_attr.set_output_dims_mapping(
output_arg_name, cached_dims_mapping)
i += 1
cached_dist_attr_candidates.append(new_op_dist_attr)
return cached_dist_attr_candidates
# cached_candidates_info = []
input_arg_names = []
for input_name in serial_op.input_names:
for input_arg_name in serial_op.input(input_name):
input_arg_names.append(input_arg_name)
self._cached_candidates_info[key].append(input_arg_names)
# cached_candidates_info.append(input_arg_names)
output_arg_names = []
for output_name in serial_op.output_names:
for output_arg_name in serial_op.output(output_name):
output_arg_names.append(output_arg_name)
self._cached_candidates_info[key].append(output_arg_names)
# cached_candidates_info.append(output_arg_names)
new_op_dist_attr = copy.deepcopy(dist_op.dist_attr)
# Find valid dims_mapping candidates for inputs
input_names = []
dims_mapping_generated = []
inputs_dist_attrs = op_dist_attr.inputs_dist_attrs
for tensor_name, tensor_dist_attr in inputs_dist_attrs.items():
original_dims_mapping = tensor_dist_attr.dims_mapping
dims_mapping_len = len(original_dims_mapping)
input_names.append(tensor_name)
if dims_mapping_len < 1:
dims_mapping_generated.append(
[copy.deepcopy(original_dims_mapping)])
else:
dims_mapping_generated.append(
self._generate_dims_mapping_candidates(
dims_mapping_len, process_mesh_len))
input_dims_mapping_candidates = []
for dims_mapping_list in itertools.product(*dims_mapping_generated):
dims_mapping_list = list(dims_mapping_list)
assert len(dims_mapping_list) == len(input_names)
for i, dims_mapping in enumerate(dims_mapping_list):
new_op_dist_attr.set_input_dims_mapping(input_names[i],
dims_mapping)
new_dist_op = DistributedOperator(dist_op.serial_op,
new_op_dist_attr)
dist_op_impls = find_compatible_distributed_operator_impls(
new_dist_op, fwd=True)
if dist_op_impls is not None:
input_dims_mapping_candidates.append(dims_mapping_list)
# Find valid dims_mapping candidates for outputs
output_names = []
dims_mapping_generated = []
outputs_dist_attrs = op_dist_attr.outputs_dist_attrs
for tensor_name, tensor_dist_attr in outputs_dist_attrs.items():
original_dims_mapping = tensor_dist_attr.dims_mapping
dims_mapping_len = len(original_dims_mapping)
output_names.append(tensor_name)
if dims_mapping_len < 1:
dims_mapping_generated.append(
[copy.deepcopy(original_dims_mapping)])
else:
dims_mapping_generated.append(
self._generate_dims_mapping_candidates(
dims_mapping_len, process_mesh_len))
output_dims_mapping_candidates = []
for dims_mapping_list in itertools.product(*dims_mapping_generated):
dims_mapping_list = list(dims_mapping_list)
assert len(dims_mapping_list) == len(output_names)
for i, dims_mapping in enumerate(dims_mapping_list):
new_op_dist_attr.set_output_dims_mapping(
output_names[i], dims_mapping)
new_dist_op = DistributedOperator(dist_op.serial_op,
new_op_dist_attr)
dist_op_impls = find_compatible_distributed_operator_impls(
new_dist_op, fwd=False)
if dist_op_impls is not None:
output_dims_mapping_candidates.append(dims_mapping_list)
if not input_dims_mapping_candidates and output_dims_mapping_candidates:
inout_dims_mapping_generated = [[[[-2]]],
output_dims_mapping_candidates]
elif input_dims_mapping_candidates and not output_dims_mapping_candidates:
inout_dims_mapping_generated = [
input_dims_mapping_candidates, [[[-2]]]
]
elif not input_dims_mapping_candidates and not output_dims_mapping_candidates:
inout_dims_mapping_generated = [[[[-2]]], [[[-2]]]]
else:
inout_dims_mapping_generated = [
input_dims_mapping_candidates, output_dims_mapping_candidates
]
# Find valid dims_mapping generated for both inputs and outputs
cached_dist_attr_candidates = []
for inout_dims_mapping_list in itertools.product(
*inout_dims_mapping_generated):
assert len(inout_dims_mapping_list) == 2
if input_dims_mapping_candidates:
assert len(inout_dims_mapping_list[0]) == len(input_names)
if output_dims_mapping_candidates:
assert len(inout_dims_mapping_list[1]) == len(output_names)
# set the dims_mappings for inputs
for i, dims_mapping in enumerate(inout_dims_mapping_list[0]):
if dims_mapping != [-2]:
new_op_dist_attr.set_input_dims_mapping(
input_names[i], dims_mapping)
# set the dims_mappings for outputs
for i, dims_mapping in enumerate(inout_dims_mapping_list[1]):
if dims_mapping != [-2]:
new_op_dist_attr.set_output_dims_mapping(
output_names[i], dims_mapping)
new_dist_op = DistributedOperator(dist_op.serial_op,
new_op_dist_attr)
dist_op_impls = find_compatible_distributed_operator_impls(
new_dist_op, partial=False)
if dist_op_impls is None:
continue
for dist_op_impl in dist_op_impls:
new_op_dist_attr.impl_type = dist_op_impl.type
new_op_dist_attr.impl_idx = dist_op_impl.idx
cached_dist_attr_candidates.append(
copy.deepcopy(new_op_dist_attr))
self._cached_candidates_info[key].append(cached_dist_attr_candidates)
return self._cached_candidates_info[key][2]
def construct_space(self):
inter_node_partitions, intra_node_partitions = self._partition_devices(
self._num_machines, self._num_devices_per_machine)
self._space.choice("inter_node_partitions",
inter_node_partitions,
default=inter_node_partitions[0])
self._space.choice("intra_node_partitions",
intra_node_partitions,
default=intra_node_partitions[0])
dist_ops = self._dist_context._dist_ops_for_program
for op_id, dist_op in dist_ops.items():
op_type = dist_op.serial_op.type
if self._include_op_types:
if op_type in self._include_op_types:
self._concerned_dist_ops[op_id] = dist_op
else:
self._concerned_dist_ops[op_id] = dist_op
        # Iterate over a snapshot since entries may be deleted in the loop
        for op_id, dist_op in list(self._concerned_dist_ops.items()):
            op_type = dist_op.serial_op.type
            if op_type in self._exclude_op_types:
                del self._concerned_dist_ops[op_id]
        print("Number of the concerned dist ops",
              len(self._concerned_dist_ops),
              flush=True)
search_space = 1
for op_id, dist_op in self._concerned_dist_ops.items():
op_dist_attr_candidates = self._generate_dist_attr_candidates(
op_id, dist_op)
search_space *= len(op_dist_attr_candidates)
self._space.choice(str(op_id),
op_dist_attr_candidates,
default=op_dist_attr_candidates[0])
def _compute_values_hash(self, values):
keys = sorted(values.keys())
s = "".join(str(k) + "=" + str(values[k]) for k in keys)
return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32]
def _random_values(self):
space = TunableSpace()
collisions = 0
while True:
for v in self._space.variables.values():
space._register(v)
space.values[v.name] = v.random(self._seed_state)
self._seed_state += 1
values = space.values
values_hash = self._compute_values_hash(values)
if values_hash in self._tried_values:
collisions += 1
if collisions > self._max_collisions:
return None
continue
self._tried_values.add(values_hash)
break
return values
def _populate_space(self):
values = self._random_values()
if values is None:
return {"status": TrialStatus.STOPPED, "values": None}
return {"status": TrialStatus.RUNNING, "values": values}
def _create_trial(self):
trial_id = "{{:0{}d}}".format(len(str(self._max_trials)))
trial_id = trial_id.format(self._num_trials)
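        # e.g. with max_trials = 500, the format is "{:03d}" and trial ids
        # read "000", "001", ...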
if self._max_trials and self._num_trials >= self._max_trials:
status = TrialStatus.STOPPED
values = None
else:
results = self._populate_space()
status = results["status"]
values = results["values"]
space = TunableSpace()
space.variables = self._space.variables
space.values = values
trial = Trial(tunable_space=space, trial_id=trial_id, status=status)
self._num_trials += 1
return trial
def _generate_pipeline_starts(self, process_mesh_list):
total_ops = len(self._dist_context._dist_ops_for_program)
total_stages = len(process_mesh_list)
ops_per_stage = total_ops // total_stages
if ops_per_stage == 0:
return None
# Compute the initial pipeline starts
pipeline_starts = []
start = 0
pipeline_starts.append(0)
# The pipeline_starts have total_stages+1 items, and
# at least have 2 items.
for _ in process_mesh_list:
start += ops_per_stage
pipeline_starts.append(start)
pipeline_starts[-1] = total_ops
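        # e.g. 10 ops over 2 stages give ops_per_stage = 5 and
        # pipeline_starts = [0, 5, 10]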
# Adjust the pipeline starts by random selection
directions = []
sizes = []
half_ops_per_stage = ops_per_stage // 2
if half_ops_per_stage > 0 and total_stages > 1:
new_pipeline_starts = []
# Don't change the first start
new_pipeline_starts.append(0)
# Consider the starts except the first and the last one
for _ in pipeline_starts[1:-1]:
directions.append(Boolean("direction"))
sizes.append(
IntRange("size",
start=0,
stop=half_ops_per_stage,
endpoint=True))
for i, start in enumerate(pipeline_starts[1:-1]):
direction = directions[i].random(self._seed)
size = sizes[i].random(self._seed)
if direction:
                    # Subtract 1 from size to avoid overlapping the new starts
new_start = start - (size - 1)
else:
new_start = start + size
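                # e.g. a start of 5 with size 2 moves back to 4 (direction
                # True) or forward to 7 (direction False)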
new_pipeline_starts.append(new_start)
# Don't change the last start
new_pipeline_starts.append(pipeline_starts[-1])
# Validate the new starts
print("Adjusted pipeline starts",
new_pipeline_starts,
half_ops_per_stage,
pipeline_starts,
flush=True)
for i, new_start in enumerate(new_pipeline_starts[1:]):
assert new_start > new_pipeline_starts[i]
return new_pipeline_starts
else:
print("Non-adjusted pipeline starts",
pipeline_starts,
half_ops_per_stage,
flush=True)
return pipeline_starts
def _apply_pipeline_partition(self, process_mesh_list):
op_id_to_process_mesh = {}
total_ops = len(self._dist_context._dist_ops_for_program)
total_stages = len(process_mesh_list)
ops_per_stage = total_ops // total_stages
if ops_per_stage == 0:
return None
pipeline_starts = self._generate_pipeline_starts(process_mesh_list)
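        # Ops with index in [pipeline_starts[k], pipeline_starts[k+1]) belong
        # to stage k, e.g. starts [0, 5, 10] put ops 0-4 on stage 0 and
        # ops 5-9 on stage 1.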
start_idx = 1
sorted_op_ids = sorted(self._dist_context._dist_ops_for_program.keys())
for idx, op_id in enumerate(sorted_op_ids):
if idx < pipeline_starts[start_idx]:
op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1]
else:
start_idx += 1
op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1]
return op_id_to_process_mesh
def _amend_dist_attr(self):
# 1) Reshape the process mesh of [1, x] to [x] or [x, 1] to [x],
# and amend the corresponding dims_mapping.
# 2) Set the dim_mapping to -1 when the shape cannot be divided
# by the corresponding processes.
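        # e.g. a process mesh [1, 8] is reshaped to [8], and a dims_mapping
        # [1, -1] on it is amended to [0, -1].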
for dist_op in self._dist_context._dist_ops_for_program.values():
dist_attr = dist_op.dist_attr
process_mesh = dist_attr.process_mesh
if process_mesh is None:
continue
assert process_mesh.ndim == 2
dim_of_one = None
dim_of_other = None
if process_mesh.topology[0] == 1:
dim_of_one = 0
dim_of_other = 1
elif process_mesh.topology[1] == 1:
dim_of_one = 1
dim_of_other = 0
if dim_of_one is not None:
dist_attr.process_mesh = ProcessMesh(process_mesh.processes)
self._dist_context.add_process_mesh(dist_attr.process_mesh)
for arg_name in dist_attr.inputs_dist_attrs.keys():
new_dims_mapping = []
dims_mapping = dist_attr.get_input_dims_mapping(arg_name)
for dim_mapping in dims_mapping:
if dim_mapping == dim_of_one:
new_dims_mapping.append(-1)
elif dim_mapping == dim_of_other:
new_dims_mapping.append(0)
else:
new_dims_mapping.append(dim_mapping)
dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping)
dims_mapping = dist_attr.get_input_dims_mapping(arg_name)
# dynamic_dims = dist_attr.get_input_dynamic_dims(arg_name)
process_mesh = dist_attr.process_mesh
process_shape = process_mesh.topology
tensor = dist_op.get_serial_input(arg_name)
if dims_mapping:
tensor_shape = tensor.shape
else:
continue
for i, dim_mapping in enumerate(dims_mapping):
# if dim_mapping != -1 \
# and (tensor_shape[i] % process_shape[dim_mapping] != 0 \
# or dynamic_dims[i] == 1):
if dim_mapping != -1 \
and (tensor_shape[i] % process_shape[dim_mapping] != 0):
dims_mapping[i] = -1
                    # Bug fix: a mesh dimension of size 1 implies no sharding
if dim_mapping != -1 \
and process_shape[dim_mapping] == 1:
dims_mapping[i] = -1
for arg_name in dist_attr.outputs_dist_attrs.keys():
new_dims_mapping = []
dims_mapping = dist_attr.get_output_dims_mapping(arg_name)
for dim_mapping in dims_mapping:
if dim_mapping == dim_of_one:
new_dims_mapping.append(-1)
elif dim_mapping == dim_of_other:
new_dims_mapping.append(0)
else:
new_dims_mapping.append(dim_mapping)
dist_attr.set_output_dims_mapping(arg_name, new_dims_mapping)
dims_mapping = dist_attr.get_output_dims_mapping(arg_name)
# dynamic_dims = dist_attr.get_output_dynamic_dims(arg_name)
process_mesh = dist_attr.process_mesh
process_shape = process_mesh.topology
tensor = dist_op.get_serial_output(arg_name)
if dims_mapping:
tensor_shape = tensor.shape
else:
continue
for i, dim_mapping in enumerate(dims_mapping):
if dim_mapping != -1 \
and (tensor_shape[i] % process_shape[dim_mapping] != 0):
dims_mapping[i] = -1
                    # Bug fix: a mesh dimension of size 1 implies no sharding
if dim_mapping != -1 \
and process_shape[dim_mapping] == 1:
dims_mapping[i] = -1
dist_op_impls = find_compatible_distributed_operator_impls(
dist_op, partial=False)
serial_op_type = dist_op.serial_op.type
if dist_op_impls is not None and (
serial_op_type != "fused_softmax_mask_upper_triangle"
or self._check_fused_softmax_mask_upper_triangle(dist_op)):
dist_op.dist_attr.impl_type = dist_op_impls[0].type
dist_op.dist_attr.impl_idx = dist_op_impls[0].idx
else:
# Use the default dist op impl
for arg_name in dist_attr.inputs_dist_attrs.keys():
dims_mapping = dist_attr.get_input_dims_mapping(arg_name)
for i, _ in enumerate(dims_mapping):
dims_mapping[i] = -1
for arg_name in dist_attr.outputs_dist_attrs.keys():
dims_mapping = dist_attr.get_output_dims_mapping(arg_name)
for i, _ in enumerate(dims_mapping):
dims_mapping[i] = -1
dist_op.dist_attr.impl_type = "default"
dist_op.dist_attr.impl_idx = 0
def _check_fused_softmax_mask_upper_triangle(self, dist_op):
"""The last_but_one dim shoule be equal to last dim."""
input_name = dist_op.serial_op.input_arg_names[0]
input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(
input_name)
topology = dist_op.dist_attr.process_mesh.topology
input_tensor = dist_op.get_serial_input(input_name)
        if input_dims_mapping[-2] != -1:
            last_but_one_dim = input_tensor.shape[-2] // topology[
                input_dims_mapping[-2]]
        else:
            last_but_one_dim = input_tensor.shape[-2]
        if input_dims_mapping[-1] != -1:
            last_dim = input_tensor.shape[-1] // topology[
                input_dims_mapping[-1]]
        else:
            last_dim = input_tensor.shape[-1]
        return last_but_one_dim == last_dim
def _eval_trial(self, trial):
if self._num_trials == 0:
num_prev_trials = 0
else:
num_prev_trials = self._num_trials - 1
results = None
start_time = time.time()
inter_node_partition = trial.space.values["inter_node_partitions"]
intra_node_partition = trial.space.values["intra_node_partitions"]
process_mesh_list = self._generate_process_mesh_list(
inter_node_partition, intra_node_partition)
print("\tprocess_mesh list", process_mesh_list, flush=True)
op_id_to_process_mesh = self._apply_pipeline_partition(
process_mesh_list)
if op_id_to_process_mesh is None:
print("Operators are less than pipeline stages", flush=True)
return results
op_id_to_dist_attr = {}
for name, value in trial.space.values.items():
if name != "inter_node_partitions" \
and name !="intra_node_partitions":
op_id_to_dist_attr[int(name)] = value
end_time = time.time()
cur_sample_time = end_time - start_time
self._sample_time = (num_prev_trials * self._sample_time +
cur_sample_time) / self._num_trials
print("\tsample_time",
num_prev_trials,
self._num_trials,
self._sample_time,
cur_sample_time,
flush=True)
assert len(op_id_to_process_mesh) == len(op_id_to_dist_attr)
start_time = time.time()
for op_id, process_mesh in op_id_to_process_mesh.items():
dist_op = self._dist_context._dist_ops_for_program[op_id]
dist_op.dist_attr = copy.deepcopy(op_id_to_dist_attr[op_id])
assert dist_op.dist_attr.impl_type == op_id_to_dist_attr[
op_id].impl_type
assert dist_op.dist_attr.impl_idx == op_id_to_dist_attr[
op_id].impl_idx
dist_op.dist_attr.process_mesh = process_mesh
self._amend_dist_attr()
self._completer._complete_tensor_dist_attr_by_op()
self._dist_context.block_state.parse_forward_blocks(
self._dist_context.serial_main_program)
end_time = time.time()
cur_complete_time = end_time - start_time
self._complete_time = (num_prev_trials * self._complete_time +
cur_complete_time) / self._num_trials
print("\tcomplete_time",
num_prev_trials,
self._num_trials,
self._complete_time,
cur_complete_time,
flush=True)
start_time = time.time()
estimate_time = self._estimate_trial()
end_time = time.time()
cur_estimate_time = end_time - start_time
self._estimate_time = (num_prev_trials * self._estimate_time +
cur_estimate_time) / self._num_trials
print("\testimate_time",
num_prev_trials,
self._num_trials,
self._estimate_time,
cur_estimate_time,
estimate_time,
flush=True)
results = {"estimate_time": estimate_time}
return results
    def _update_trial(self, trial, metrics, step=0):
for metric_name, metric_value in metrics.items():
trial.recorder.update(metric_name, metric_value, step=step)
return trial.status
def _estimate_trial(self):
assert self._cluster is not None
if self._mode == "eval":
self._estimator = CostEstimator(
self._dist_context.serial_main_program,
self._cluster,
loop_count=self._loop_count)
elif self._mode == "predict":
self._estimator = CostEstimator(
self._dist_context.serial_main_program,
self._cluster,
loop_count=self._loop_count)
elif self._mode == "train":
# get serial main program with backward
serial_main_program = self._dist_context.serial_main_program
serial_startup_program = self._dist_context.serial_startup_program
serial_optimizer = self._dist_context.serial_optimizer
# Generate backward
serial_loss = self._dist_context.serial_fetch_vars["loss"][0]
params_grads = self._parallelizer._generate_backward(
serial_main_program, serial_startup_program, serial_loss)
# Generate optimizer
optimizer_ops = self._parallelizer._generate_optimizer(
serial_main_program, serial_startup_program, serial_optimizer,
params_grads)
self._estimator = CostEstimator(serial_main_program,
self._cluster,
loop_count=self._loop_count)
max_memory = self._estimator._estimate_max_memory_by_dist_op(
self._dist_context)
print("\tmax_memory", "{:,}".format(max_memory), flush=True)
            # The max memory must be less than 80% of 32GB (hard-coded)
if max_memory > 32 * 0.8 * 1024 * 1024 * 1024:
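                # i.e. 25.6 GiB; over-budget trials are pruned by returning
                # an infinite cost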
return math.inf
else:
global_cost = self._estimator.estimate(self._dist_context)
return global_cost.time
def _store_init_parallel_strategy(self):
        # If there is no annotation information, use data parallel as the initial parallel strategy.
        # TODO: we need a better way to set up the initial parallel strategy.
if not self._dist_context.has_annotation \
or not self._dist_context.process_meshes:
ranks = self._num_machines * self._num_devices_per_machine
tensor_node = self._dist_context._serial_ordered_tensor_nodes[0]
tensor_node_id = _node_id(tensor_node)
tensor = self._dist_context._dist_tensors_for_graph[
tensor_node_id].serial_tensor
tensor_dist_attr = self._dist_context._dist_tensors_for_graph[
tensor_node_id].dist_attr
tensor_dist_attr.process_mesh = ProcessMesh(list(range(ranks)))
self._dist_context._process_meshes.append(
tensor_dist_attr.process_mesh)
tensor_dist_attr.dims_mapping = [0] + [
-1 for _ in range(len(tensor.shape) - 1)
]
tensor_dist_attr.mark_annotated("process_mesh")
tensor_dist_attr.mark_annotated("dims_mapping")
print("Use dp as the init parallel strategy!", flush=True)
# Do the sharding propagation
self._completer.complete_forward_annotation()
self._dist_context.block_state.parse_forward_blocks(
self._dist_context.serial_main_program)
        # Back up the initial parallel strategy
self._init_parallel_strategy[0] = copy.deepcopy(
self._dist_context._dist_tensors_for_program)
self._init_parallel_strategy[1] = copy.deepcopy(
self._dist_context._dist_ops_for_program)
self._init_parallel_strategy[2] = copy.deepcopy(
self._dist_context.process_meshes)
# Initialize the best parallel strategy to the initial one
self._best_parallel_strategy[0] = copy.deepcopy(
self._dist_context._dist_tensors_for_program)
self._best_parallel_strategy[1] = copy.deepcopy(
self._dist_context._dist_ops_for_program)
self._best_parallel_strategy[2] = copy.deepcopy(
self._dist_context._process_meshes)
def _store_best_parallel_strategy(self):
# Swap the best and the current parallel strategy
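        # Swapping buffers instead of deep-copying keeps this cheap: the
        # current strategy is stored as the best, and the previous best
        # becomes the context's working copy.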
tmp = [None, None, None]
tmp[0] = self._best_parallel_strategy[0]
tmp[1] = self._best_parallel_strategy[1]
tmp[2] = self._best_parallel_strategy[2]
self._best_parallel_strategy[
0] = self._dist_context._dist_tensors_for_program
self._best_parallel_strategy[
1] = self._dist_context._dist_ops_for_program
self._best_parallel_strategy[2] = self._dist_context._process_meshes
self._dist_context._dist_tensors_for_program = tmp[0]
self._dist_context._dist_ops_for_program = tmp[1]
self._dist_context._process_meshes = tmp[2]
def tune(self):
global_start_time = time.time()
self._dist_context._backup(serial=True, dist=True)
# This store statement must follow the above backup statement
self._store_init_parallel_strategy()
init_time = self._estimate_trial() # estimate_trial when init
# print_program_with_dist_attr(self._dist_context.serial_main_program, self._dist_context)
        # We have to restore the distributed context, because the estimation of one trial needs to
        # generate the backward and update parts. Since we will do the tuning process,
        # here we only need to reset all distributed information to the default.
self._dist_context._restore(serial=True,
serial_mode="to_backup",
dist=True,
dist_mode="to_default")
best_time = init_time
start_time = time.time()
self.construct_space()
end_time = time.time()
print("construct_space time",
self._num_trials,
end_time - start_time,
flush=True)
create_trial_time = 0.0
eval_trial_time = 0.0
self._sample_time = 0.0
self._complete_time = 0.0
self._estimate_time = 0.0
while True:
start_time = time.time()
trial = self._create_trial()
if self._num_trials == 0:
num_prev_trials = 0
else:
num_prev_trials = self._num_trials - 1
end_time = time.time()
cur_create_trial_time = end_time - start_time
create_trial_time = (num_prev_trials * create_trial_time +
cur_create_trial_time) / self._num_trials
print("create_trial time",
num_prev_trials,
self._num_trials,
create_trial_time,
cur_create_trial_time,
flush=True)
if trial.status == TrialStatus.STOPPED:
break
            # We need to back up the distributed context, because the evaluation of one trial will
            # generate the backward and update parts, which may change the context.
            # However, the distributed information of the context isn't backed up since a new one is used.
self._dist_context._backup(serial=True, dist=False)
start_time = time.time()
results = self._eval_trial(trial)
end_time = time.time()
cur_eval_trial_time = end_time - start_time
eval_trial_time = (num_prev_trials * eval_trial_time +
cur_eval_trial_time) / self._num_trials
print("eval_trial time",
num_prev_trials,
self._num_trials,
eval_trial_time,
cur_eval_trial_time,
"\n",
flush=True)
cur_time = results["estimate_time"]
if cur_time < best_time:
                self._update_trial(trial, results)
self._store_best_parallel_strategy()
best_time = cur_time
# We need to restore the distributed context and reset the distributed information to the default.
self._dist_context._restore(serial=True,
serial_mode="to_backup",
dist=True,
dist_mode="to_default")
# Select the best parallel strategy
self._dist_context._dist_tensors_for_program = self._best_parallel_strategy[
0]
self._dist_context._dist_ops_for_program = self._best_parallel_strategy[
1]
self._dist_context._process_meshes = self._best_parallel_strategy[2]
......@@ -13,20 +13,17 @@
# limitations under the License.
import os
import sys
import argparse
import traceback
import pickle
import json
import time
import numpy as np
from functools import partial
import paddle
from paddle.fluid.framework import Program, _current_expected_place
from paddle.fluid.framework import Operator, Parameter
from paddle.distributed.auto_parallel.process_group import clear_all_process_groups, get_all_process_groups, new_process_group
from paddle.distributed.auto_parallel.dist_loader import NonIterableGeneratorLoader
from paddle.fluid.framework import Operator
from paddle.distributed.auto_parallel.process_group import get_all_process_groups, new_process_group
from paddle.distributed.auto_parallel.dist_loader import DistributedDataLoaderFromGenerator
from paddle.distributed.collective import _get_global_env
paddle.enable_static()
......@@ -135,13 +132,14 @@ def create_dataloader(main_program,
# insert read op at the end of program
places = paddle.static.cuda_places()
with paddle.static.program_guard(main_program, startup_program):
dataloader = NonIterableGeneratorLoader(
dataset,
feed_list,
places,
dataset.batch_size,
epochs,
steps_per_epoch,
dataloader = DistributedDataLoaderFromGenerator(
dataset=dataset,
feed_list=feed_list,
capacity=70,
places=places,
batch_size=dataset.batch_size,
epochs=epochs,
steps_per_epoch=steps_per_epoch,
data_parallel_world_size=dataset.dp_world_size,
data_parallel_rank=dataset.dp_rank)
......
......@@ -44,10 +44,18 @@ class TunableSpace(object):
def variables(self):
return self._variables
@variables.setter
def variables(self, variables):
self._variables = variables
@property
def values(self):
return self._values
@values.setter
def values(self, values):
self._values = values
def get_value(self, name):
if name in self.values:
return self.values[name]
......
......@@ -90,6 +90,7 @@ class Choice(TunableVariable):
raise TypeError(
"Choice can contain only one type of value, but found values: {} with types: {}."
.format(str(values), str(types)))
self._is_unknown_type = False
if isinstance(values[0], str):
values = [str(v) for v in values]
......@@ -108,9 +109,8 @@ class Choice(TunableVariable):
if default is not None:
default = bool(default)
else:
raise TypeError(
"Choice can only contain str, int, float, or boll, but found: {} "
.format(str(values)))
self._is_unknown_type = True
self._indices = [i for i in range(len(values))]
self.values = values
if default is not None and default not in values:
......@@ -129,7 +129,11 @@ class Choice(TunableVariable):
def random(self, seed=None):
rng = np.random.default_rng(seed)
return rng.choice(self.values)
if self._is_unknown_type:
            index = rng.choice(self._indices)
            return self.values[index]
else:
return rng.choice(self.values)
def get_state(self):
state = super(Choice, self).get_state()
......
......@@ -27,6 +27,10 @@ from paddle.distributed.auto_parallel.process_group import get_all_process_group
from paddle.fluid.io import is_parameter, is_belong_to_optimizer
from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute
__not_shape_var_type__ = [
core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES
]
def get_logger(log_level, name="auto_parallel"):
logger = logging.getLogger(name)
......@@ -1583,3 +1587,80 @@ def find_higher_order_backward_op(program):
return True
return False
def get_lr(optimizer):
if isinstance(optimizer, paddle.optimizer.Optimizer):
return optimizer.get_lr()
elif isinstance(optimizer, paddle.fluid.optimizer.Optimizer):
if isinstance(optimizer._learning_rate, float):
return optimizer._learning_rate
else:
return optimizer._learning_rate()
else:
raise TypeError(
"'optimizer' must be object of class `paddle.optimizer.Optimizer`" \
" or `paddle.fluid.optimizer.Optimizer`, but got {}.".format(type(optimizer))
)
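# e.g. for an optimizer constructed with learning_rate=0.001, get_lr returns
# 0.001; with a scheduler, it returns the scheduler's current value.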
def initialize_pg_in_full_mode(all_process_groups, cur_rank):
import socket
from ..collective import _get_global_env
has_recv_by_socket = []
# This is a magic number
magic_num = 500
genv = _get_global_env()
cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":")
cur_rank_recv_port = int(cur_rank_port) + magic_num
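    # e.g. an endpoint "10.0.0.1:6070" listens for handshakes on port 6570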
server_socket = None
# Large enough for recv rank
buff_size = 1024
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_socket.bind((cur_rank_ip, cur_rank_recv_port))
# The 10 is an empirical value
server_socket.listen(10)
client_sockets = {}
for process_group in all_process_groups:
if cur_rank not in process_group.ranks:
continue
if len(process_group.ranks) == 2:
index = process_group.ranks.index(cur_rank)
is_send = True if index == 0 else False
if is_send:
recv_rank = process_group.ranks[1]
recv_rank_ip, recv_rank_port = genv.trainer_endpoints[
recv_rank].split(":")
connect_port = int(recv_rank_port) + magic_num
client_socket = socket.socket(socket.AF_INET,
socket.SOCK_STREAM)
client_socket.connect((recv_rank_ip, connect_port))
client_socket.send(str(cur_rank).encode('utf-8'))
rank = client_socket.recv(buff_size).decode('utf-8')
rank = int(rank)
if rank != recv_rank:
raise ValueError(
"Please check comm pair, the recv rank should be {} but got {}."
.format(recv_rank, rank))
else:
print("It is able to instantiate {} as sender now.".format(
process_group.ranks))
client_socket.close()
else:
send_rank = process_group.ranks[0]
while True:
if send_rank not in has_recv_by_socket:
client_socket, recv_addr = server_socket.accept()
rank = int(client_socket.recv(buff_size).decode())
client_sockets[rank] = client_socket
has_recv_by_socket.append(rank)
else:
client_sockets[send_rank].send(
str(cur_rank).encode("utf-8"))
client_sockets[send_rank].close()
print("It is able to instantiate {} as recver now.".
format(process_group.ranks))
break
process_group.instantiate()
server_socket.close()
......@@ -517,9 +517,11 @@ class AMPPass(PassBase):
self.set_attr("use_dynamic_loss_scaling", False)
self.set_attr("input_data", [])
self.set_attr("params_grads", [])
self._loss = None
self._loss_scaling = None
self._num_good_steps = None
self._num_bad_steps = None
self._loss = None
def _check_self(self):
if self.get_attr("init_loss_scaling") < 0:
......
......@@ -82,9 +82,11 @@ class DataParallelOptimizationPass(PassBase):
with paddle.static.program_guard(main_program, startup_program):
self._analyze_program()
self._prune_grad_scaling()
self._calc_comm_overlap()
grad_group = self._fuse_allreduce()
if self.is_data_parallel_applied():
self._prune_grad_scaling()
self._calc_comm_overlap()
grad_group = self._fuse_allreduce()
# self.summary(grad_group)
......@@ -167,6 +169,9 @@ class DataParallelOptimizationPass(PassBase):
) == 0, "Unexception: gradients [{}] is scaled BUT NOT synchronized.".format(
not_synchronized_grads)
def is_data_parallel_applied(self):
return len(self._group_to_grad_name_map) > 0
def _could_be_prune(self):
return self.dist_context.gradient_scale and (
......
......@@ -213,7 +213,7 @@ class ClipGradByGloblNormPass(PassBase):
if self.get_attr("dist_context") is None:
return False
dist_context = self.get_attr("dist_context")
if dist_context._lr_optimizer._grad_clip is None:
if dist_context._serial_optimizer._grad_clip is None:
return False
if self.get_attr("params_grads") is None:
return False
......
......@@ -396,7 +396,7 @@ class ShardingPass(PassBase):
dp_ring_ids = [group.id for group in self.dp_groups]
for idx, op in reversed(list(enumerate(main_block.ops))):
if is_data_parallel_reduce_op(op):
if _is_param_grad_allreduce_op(op, main_block):
input_name = op.input_arg_names[0]
base_name = _get_base_name_from_grad_name(input_name)
sharding_info = self.varname_to_sharding_info[base_name]
......@@ -653,6 +653,20 @@ def _get_base_name_from_grad_name(grad_name):
return base_name
def _is_param_grad_allreduce_op(op, block):
if not is_data_parallel_reduce_op(op):
return False
output_name = op.output_arg_names[0]
base_name = _get_base_name_from_grad_name(output_name)
if not block.has_var(base_name):
return False
return block.var(base_name).is_parameter
def _is_param_grad_sum_op(op, block):
if not is_backward_op(op):
......
......@@ -60,6 +60,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_pass_amp MODULES test_pass_amp ENVS ${dist_ENVS})
set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
TIMEOUT 50)
py_test_modules(test_engine_callbacks MODULES test_engine_callbacks)
set_tests_properties(test_engine_callbacks
PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
py_test_modules(test_while_op_completion MODULES test_while_op_completion
ENVS ${dist_ENVS})
......@@ -78,6 +81,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_dist_embedding MODULES test_dist_embedding ENVS
${dist_ENVS})
py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS})
py_test_modules(test_dist_split MODULES test_dist_split ENVS ${dist_ENVS})
py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS})
py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS})
py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS})
......@@ -96,4 +100,19 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_strategy MODULES test_strategy)
py_test_modules(test_pass_quantization MODULES test_pass_quantization)
py_test_modules(test_dist_shape MODULES test_dist_shape)
py_test_modules(test_dist_assign MODULES test_dist_assign)
py_test_modules(test_conditional_block_reshard MODULES
test_conditional_block_reshard)
py_test_modules(test_parallel_tuner MODULES test_parallel_tuner ENVS
${dist_ENVS})
set_tests_properties(test_parallel_tuner PROPERTIES TIMEOUT 120)
py_test_modules(test_parallel_tuner_full MODULES test_parallel_tuner_full
ENVS ${dist_ENVS})
set_tests_properties(test_parallel_tuner_full PROPERTIES TIMEOUT 120)
py_test_modules(test_parallel_tuner_predict MODULES
test_parallel_tuner_predict ENVS ${dist_ENVS})
set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120)
endif()
......@@ -88,33 +88,27 @@ class TestAMPPass(unittest.TestCase):
def test_amp_pass(self):
# mp2 training
mp_engine = self.get_engine()
mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
mp_losses = np.array(mp_losses["loss"])
history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
mp_losses = np.array(history.history["loss"])
# mp2 amp-o1 training
amp_o1_engine = self.get_engine(True, "o1")
amp_o1_losses = amp_o1_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
amp_o1_losses = np.array(amp_o1_losses["loss"])
history = amp_o1_engine.fit(self.dataset, 3, batch_size=self.batch_size)
amp_o1_losses = np.array(history.history["loss"])
amp_o1_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
# self.check_results(mp_losses, amp_o1_losses)
# mp2 amp-o2 training
amp_o2_engine = self.get_engine(True, "o2")
amp_o2_losses = amp_o2_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
amp_o2_losses = np.array(amp_o2_losses["loss"])
history = amp_o2_engine.fit(self.dataset, 3, batch_size=self.batch_size)
amp_o2_losses = np.array(history.history["loss"])
amp_o2_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
# self.check_results(mp_losses, amp_o2_losses)
# mp2 amp-o3 training
amp_o3_engine = self.get_engine(True, "o3")
amp_o3_losses = amp_o3_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
amp_o3_losses = np.array(amp_o3_losses["loss"])
history = amp_o3_engine.fit(self.dataset, 3, batch_size=self.batch_size)
amp_o3_losses = np.array(history.history["loss"])
amp_o3_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
# self.check_results(mp_losses, amp_o3_losses)
......
......@@ -20,6 +20,8 @@ import os
import numpy as np
import subprocess
import paddle
import paddle.static as static
import paddle.utils as utils
import paddle.nn as nn
import paddle.fluid as fluid
import paddle.static as static
......@@ -29,14 +31,17 @@ from paddle.fluid import layers
from paddle.io import Dataset, IterableDataset, DataLoader
from paddle.distributed.fleet import auto
from paddle.distributed.auto_parallel.interface import get_collection, CollectionNames
from paddle.optimizer.lr import CosineAnnealingDecay
from paddle.fluid.dataloader.collate import default_collate_fn
paddle.enable_static()
global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
PP_MESH_0 = auto.ProcessMesh([0])
PP_MESH_1 = auto.ProcessMesh([1])
batch_size = 1
epoch_num = 1
batch_size = 2
batch_num = 10
hidden_size = 1024
sequence_len = 512
......@@ -46,6 +51,8 @@ class_num = 10
paddle.seed(44)
is_fetch = True
is_feed = True
my_feed_vars = []
class MyDataset(Dataset):
......@@ -63,6 +70,23 @@ class MyDataset(Dataset):
return self.num_samples
def get_random_inputs_and_labels(image_shape, label_shape):
input = np.random.random(size=image_shape).astype('float32')
label = np.random.random(size=label_shape).astype('int64')
return input, label
def batch_generator_creator():
def __reader__():
for _ in range(batch_num):
batch_input, batch_label = get_random_inputs_and_labels(
[batch_size, image_size], [batch_size, 1])
yield batch_input, batch_label
return __reader__
class MLPLayer(nn.Layer):
def __init__(self,
......@@ -92,16 +116,20 @@ class MLPLayer(nn.Layer):
def forward(self, input):
out = auto.shard_op(self.norm, PP_MESH_0)(input)
out = self.linear0(out)
if is_feed:
my_feed_vars.append((out, out.shape))
out = F.gelu(out, approximate=True)
out = auto.shard_op(self.linear1, PP_MESH_1)(out)
out = self.dropout(out)
out = self.linear2(out)
if is_feed:
my_feed_vars.append((out, out.shape))
if is_fetch:
auto.fetch(out, "out")
auto.fetch(out, "my_fetch", logging=True)
return out
def train(fetch):
def train_high_level(fetch):
global is_fetch
is_fetch = fetch
mlp = MLPLayer(hidden_size=hidden_size,
......@@ -124,10 +152,12 @@ def train(fetch):
# train
train_dataset = MyDataset(batch_num * batch_size)
eval_dataset1 = MyDataset(5 * batch_size)
engine.fit(train_data=train_dataset,
epochs=2,
batch_size=batch_size,
valid_data=eval_dataset1)
history = engine.fit(train_data=train_dataset,
epochs=2,
batch_size=batch_size,
valid_data=eval_dataset1,
log_freq=1)
# eval
eval_dataset2 = MyDataset(batch_size)
......@@ -135,7 +165,7 @@ def train(fetch):
# predict
test_dataset = MyDataset(batch_size)
engine.predict(test_dataset, batch_size=batch_size)
outputs = engine.predict(test_dataset, batch_size=batch_size)
# save
temp_dir = tempfile.TemporaryDirectory()
......@@ -145,6 +175,265 @@ def train(fetch):
temp_dir.cleanup()
def train_low_level():
mlp = MLPLayer(hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
metric = paddle.metric.Accuracy()
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(mlp, loss, optimizer, metrics=None, strategy=strategy)
feed_dict = {}
for feed_var, shape in my_feed_vars:
feed_dict[feed_var.name] = np.zeros(shape, dtype="float32")
    # Build a normal dataloader
# train
train_dataset = MyDataset(batch_num * batch_size)
train_dataloader = engine.dataloader(train_dataset,
batch_size=batch_size,
mode="train")
engine.prepare(mode="train")
for data in train_dataloader:
outs = engine.run(data, feed=feed_dict, mode="train")
# eval
eval_dataset2 = MyDataset(batch_size)
eval_dataloader = engine.dataloader(eval_dataset2,
batch_size=batch_size,
mode="eval")
engine.prepare(mode="eval")
for data in eval_dataloader:
outs = engine.run(data, feed=feed_dict, mode="eval")
# predict
engine.to_mode("predict")
test_dataset = MyDataset(batch_size)
predict_dataloader = engine.dataloader(test_dataset, batch_size=batch_size)
engine.prepare()
for data in predict_dataloader:
outs = engine.run(data, feed=feed_dict)
# save
temp_dir = tempfile.TemporaryDirectory()
model_filename = os.path.join(temp_dir.name, 'mlp')
engine.save(model_filename, training=True)
engine.load(model_filename)
temp_dir.cleanup()
# Build dataloader from generator
# train
train_dataset = MyDataset(batch_num * batch_size)
train_dataloader = engine.dataloader_from_generator(train_dataset,
batch_size=batch_size,
mode="train")
engine.prepare(mode="train")
for data in train_dataloader:
outs = engine.run(data, feed=feed_dict, mode="train")
# eval
engine.to_mode("eval")
eval_dataset2 = MyDataset(batch_size)
eval_dataloader = engine.dataloader_from_generator(eval_dataset2,
batch_size=batch_size)
engine.prepare()
for data in eval_dataloader:
outs = engine.run(data, feed=feed_dict)
# predict
test_dataset = MyDataset(batch_size)
predict_dataloader = engine.dataloader_from_generator(test_dataset,
batch_size=batch_size,
mode="predict")
engine.prepare(mode="predict")
for data in predict_dataloader:
outs = engine.run(data, feed=feed_dict, mode="predict")
# save
temp_dir = tempfile.TemporaryDirectory()
model_filename = os.path.join(temp_dir.name, 'mlp')
engine.save(model_filename, training=True)
engine.load(model_filename)
temp_dir.cleanup()
def train_builtin_data_vars():
mlp = MLPLayer(hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
metric = paddle.metric.Accuracy()
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy)
# train
engine.to_mode("train")
input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input')
label_spec = static.InputSpec([batch_size, 1], 'int64', 'label')
engine.prepare(inputs_spec=[input_spec], labels_spec=[label_spec])
with static.program_guard(engine.main_program, engine.startup_program):
feed_list = engine.inputs + engine.labels
print(feed_list)
loader = paddle.io.DataLoader.from_generator(feed_list=feed_list,
capacity=4 * batch_size,
iterable=False)
places = static.cuda_places()
loader.set_batch_generator(batch_generator_creator(), places=places)
for _ in range(epoch_num):
loader.start() # call DataLoader.start() before each epoch starts
try:
while True:
engine.run()
except paddle.fluid.core.EOFException:
loader.reset(
) # call DataLoader.reset() after catching EOFException
def train_non_builtin_data_vars():
main_program = static.Program()
startup_program = static.Program()
with static.program_guard(main_program,
startup_program), utils.unique_name.guard():
input = static.data(name="input",
shape=[batch_size, image_size],
dtype='float32')
label = static.data(name="label", shape=[batch_size, 1], dtype='int64')
loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
capacity=4 * batch_size,
iterable=False)
places = static.cuda_places()
loader.set_batch_generator(batch_generator_creator(), places=places)
mlp = MLPLayer(hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
metric = paddle.metric.Accuracy()
predict = mlp(input)
loss_var = loss(predict, label)
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(loss=loss_var,
optimizer=optimizer,
metrics=metric,
strategy=strategy)
# train
engine.to_mode("train")
engine.prepare(inputs=[input],
labels=[label],
main_program=main_program,
startup_program=startup_program)
for _ in range(epoch_num):
loader.start() # call DataLoader.start() before each epoch starts
try:
while True:
engine.run()
except paddle.fluid.core.EOFException:
loader.reset(
) # call DataLoader.reset() after catching EOFException
def get_cost():
main_program = static.default_main_program()
startup_program = static.default_startup_program()
with static.program_guard(main_program,
startup_program), utils.unique_name.guard():
input = static.data(name="input",
shape=[batch_size, image_size],
dtype='float32')
label = static.data(name="label", shape=[batch_size, 1], dtype='int64')
loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
capacity=4 * batch_size,
iterable=False)
places = static.cuda_places()
loader.set_batch_generator(batch_generator_creator(), places=places)
mlp = MLPLayer(hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
metric = paddle.metric.Accuracy()
predict = mlp(input)
loss_var = loss(predict, label)
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(loss=loss_var,
optimizer=optimizer,
metrics=metric,
strategy=strategy)
engine.cost()
def get_cost_by_spec():
mlp = MLPLayer(hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
loss = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
metric = paddle.metric.Accuracy()
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy)
input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input')
label_spec = static.InputSpec([batch_size, 1], 'int64', 'label')
engine.cost(mode="eval", inputs_spec=[input_spec], labels_spec=[label_spec])
if __name__ == "__main__":
train(fetch=True)
train(fetch=False)
train_high_level(fetch=True)
train_high_level(fetch=False)
train_low_level()
train_builtin_data_vars()
train_non_builtin_data_vars()
get_cost()
get_cost_by_spec()
......@@ -84,25 +84,32 @@ class TestGradientMergePass(unittest.TestCase):
def test_gradient_merge_pass(self):
# dp2 training
dp_engine = self.get_engine()
dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
dp_losses = np.array(dp_losses["loss"])
history = dp_engine.fit(self.dataset,
3,
batch_size=self.batch_size,
log_freq=1)
dp_losses = np.array(history.history["loss"])
# dp2 gradient merge training
gm_engine = self.get_engine(True)
gm_losses = gm_engine.fit(self.dataset, 3, batch_size=self.batch_size)
gm_losses = np.array(gm_losses["loss"])
avg_loss = 0
pass_avg_ret_list = []
for i, pass_ret in enumerate(gm_losses):
if (i + 1) % 4 == 0:
avg_loss += pass_ret
pass_avg_ret_list.append(avg_loss / 4)
avg_loss = 0
else:
avg_loss += pass_ret
self.check_results(dp_losses, np.array(pass_avg_ret_list))
history = gm_engine.fit(self.dataset,
3,
batch_size=self.batch_size,
log_freq=1)
gm_losses = np.array(history.history["loss"])
# avg_loss = 0
# pass_avg_ret_list = []
# for i, pass_ret in enumerate(gm_losses):
# if (i + 1) % 4 == 0:
# avg_loss += pass_ret
# pass_avg_ret_list.append(avg_loss / 4)
# avg_loss = 0
# else:
# avg_loss += pass_ret
        # NOTE: every sample from the dataset is identical
self.check_results(dp_losses, gm_losses)
if __name__ == "__main__":
......
......@@ -79,13 +79,13 @@ class TestRecomputePass(unittest.TestCase):
def test_recompute_pass(self):
# mp2 training
mp_engine = self.get_engine()
mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
mp_losses = np.array(mp_losses["loss"])
history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
mp_losses = np.array(history.history["loss"])
# mp2 recompute training
rc_engine = self.get_engine(True)
rc_losses = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size)
rc_losses = np.array(rc_losses["loss"])
history = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size)
rc_losses = np.array(history.history["loss"])
self.check_results(mp_losses, rc_losses)
......
......@@ -84,31 +84,31 @@ class TestShardingPass(unittest.TestCase):
def test_sharding_pass(self):
# dp2 training
dp_engine = self.get_engine()
dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
dp_losses = np.array(dp_losses["loss"])
history = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
dp_losses = np.array(history.history["loss"])
# sharding2 stage1 training
sharding1_engine = self.get_engine(True, 1)
sharding1_losses = sharding1_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding1_losses = np.array(sharding1_losses["loss"])
history = sharding1_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding1_losses = np.array(history.history["loss"])
self.check_results(dp_losses, sharding1_losses)
# sharding2 stage2 training
sharding2_engine = self.get_engine(True, 2)
sharding2_losses = sharding2_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding2_losses = np.array(sharding2_losses["loss"])
history = sharding2_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding2_losses = np.array(history.history["loss"])
self.check_results(dp_losses, sharding2_losses)
# sharding2 stage3 training
sharding3_engine = self.get_engine(True, 3)
sharding3_losses = sharding3_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding3_losses = np.array(sharding3_losses["loss"])
history = sharding3_engine.fit(self.dataset,
3,
batch_size=self.batch_size)
sharding3_losses = np.array(history.history["loss"])
self.check_results(dp_losses, sharding3_losses)
......
......@@ -82,6 +82,9 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2OpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import DropoutGradOpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleOpCost
from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleGradOpCost
from test_cluster import cluster_json
......@@ -417,6 +420,22 @@ class TestCompOpCost(unittest.TestCase):
self.assertTrue(op_cost.flops >= 0)
self.assertTrue(op_cost.time >= 0)
self.assertTrue(op_cost.memory >= 0)
op_cost = DropoutGradOpCost(cluster=cluster)
self.assertTrue(op_cost.flops >= 0)
self.assertTrue(op_cost.time >= 0)
self.assertTrue(op_cost.memory >= 0)
op_cost = FusedSoftmaxMaskUpperTriangleOpCost(cluster=cluster)
self.assertTrue(op_cost.flops >= 0)
self.assertTrue(op_cost.time >= 0)
self.assertTrue(op_cost.memory >= 0)
op_cost = FusedSoftmaxMaskUpperTriangleGradOpCost(cluster=cluster)
self.assertTrue(op_cost.flops >= 0)
self.assertTrue(op_cost.time >= 0)
self.assertTrue(op_cost.memory >= 0)
# Remove unnecessary files
if os.path.exists(cluster_json_path):
os.remove(cluster_json_path)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.static import InputSpec
from paddle.distributed.fleet import auto
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=64,
intermediate_size=4 * 64,
initializer_range=0.02):
super(MLPLayer, self).__init__()
self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)
self.linear0 = nn.Linear(
hidden_size,
intermediate_size,
paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=initializer_range)),
bias_attr=None)
self.linear1 = nn.Linear(
intermediate_size,
hidden_size,
paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=initializer_range)),
bias_attr=None)
def forward(self, input):
out = self.norm(input)
auto.shard_tensor(self.linear0.weight, auto.ProcessMesh([0, 1], "x"),
[None, "x"])
out = self.linear0(out)
out = F.gelu(out, approximate=True)
auto.shard_tensor(self.linear1.weight, auto.ProcessMesh([0, 1], "x"),
["x", None])
out = self.linear1(out)
if paddle.mean(out) < 2:
out = self.norm(out)
out = self.linear0(out)
out = F.gelu(out, approximate=True)
out = self.linear1(out)
else:
out = self.norm(out)
out = self.linear0(out)
out = self.linear1(out)
return out
def loss_fn(predict, label):
error_cost = paddle.nn.functional.square_error_cost(predict, label)
loss = paddle.mean(error_cost)
return loss
class TestSubblock(unittest.TestCase):
def test_subblock(self):
mlp = MLPLayer()
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(model=mlp, loss=loss_fn, strategy=strategy)
        input_spec = InputSpec([4, 64], 'float32', 'input')
label_spec = InputSpec([4, 1], 'float32', 'label')
        engine.prepare(inputs_spec=[input_spec],
labels_spec=[label_spec],
mode="predict")
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
from paddle.distributed.fleet import auto
paddle.enable_static()
def make_program():
main_program = paddle.fluid.Program()
start_program = paddle.fluid.Program()
with paddle.static.program_guard(main_program, start_program):
x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
y = paddle.static.data(name='y', shape=[4, 4, 8], dtype='float32')
auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["d"]),
[None, "d", None])
z = paddle.add(x, y)
paddle.assign(x, output=z)
return main_program, start_program
def parallelizer(program_func, rank):
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.dist_context import DistributedContext
main_program, start_program = program_func()
dist_context = DistributedContext()
completer = Completer(dist_context)
completer.complete_forward_annotation(main_program)
dist_context.block_state.parse_forward_blocks(main_program)
partitioner = Partitioner(dist_context, rank)
dist_main_prog, _, _ = partitioner.partition(main_program, start_program,
[])
return dist_main_prog, dist_context
class TestDistAssign(unittest.TestCase):
def test_dist_assign(self):
dist_main_prog, dist_context = parallelizer(make_program, 0)
ops = dist_main_prog.global_block().ops
for op in ops:
if op.type == "assign":
dist_op = dist_context.get_dist_op_for_program(op)
                assert dist_op.dist_attr.impl_type == "assign"
                assert dist_op.dist_attr.impl_idx == 0
x_name = op.input_arg_names[0]
out_name = op.output_arg_names[0]
out_var = dist_main_prog.global_block().vars[out_name]
dist_out = dist_context.get_dist_tensor_for_program(out_var)
x_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(
x_name)
out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(
out_name)
assert x_dims_mapping == out_dims_mapping
assert out_dims_mapping == dist_out.dist_attr.dims_mapping
if __name__ == "__main__":
unittest.main()
......@@ -199,7 +199,7 @@ class TestDistributedContext(unittest.TestCase):
"_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \
"_serial_ordered_op_nodes", "_original_serial_loss", \
"_original_serial_feed_vars", "_original_serial_fetch_vars", \
"_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_lr_optimizer", \
"_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_serial_optimizer", \
"_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \
"_pass_context"]
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
from paddle.distributed.fleet import auto
paddle.enable_static()
def make_program():
main_program = paddle.fluid.Program()
start_program = paddle.fluid.Program()
with paddle.static.program_guard(main_program, start_program):
x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
x.stop_gradient = False
auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]),
["x", None, None])
shape = paddle.shape(x)
return main_program, start_program
def parallelizer(program_func, rank):
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.dist_context import DistributedContext
main_program, start_program = program_func()
dist_context = DistributedContext()
completer = Completer(dist_context)
completer.complete_forward_annotation(main_program)
dist_context.block_state.parse_forward_blocks(main_program)
partitioner = Partitioner(dist_context, rank)
dist_main_prog, _, _ = partitioner.partition(main_program, start_program,
[])
return dist_main_prog, dist_context
class TestDistShape(unittest.TestCase):
def test_dist_shape(self):
dist_main_prog, dist_context = parallelizer(make_program, 0)
ops = dist_main_prog.global_block().ops
shape_op = ops[0]
dist_op = dist_context.get_dist_op_for_program(shape_op)
        assert dist_op.dist_attr.impl_type == "shape"
        assert dist_op.dist_attr.impl_idx == 0
in_name = shape_op.input_arg_names[0]
out_name = shape_op.output_arg_names[0]
in_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(in_name)
out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(out_name)
assert in_dims_mapping == [0, -1, -1]
assert out_dims_mapping == [-1]
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
from paddle.distributed.fleet import auto
from paddle.fluid import program_guard
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
paddle.enable_static()
def make_program_dp2():
main_program = paddle.fluid.Program()
start_program = paddle.fluid.Program()
with paddle.static.program_guard(main_program, start_program):
x = paddle.static.data(name='x', shape=[4, 12, 16], dtype='float32')
x.stop_gradient = False
auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]),
["x", None, None])
out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=1)
return main_program, start_program
def parallelizer(program_func, rank):
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.dist_context import DistributedContext
main_program, start_program = program_func()
dist_context = DistributedContext()
completer = Completer(dist_context)
completer.complete_forward_annotation(main_program)
dist_context.block_state.parse_forward_blocks(main_program)
partitioner = Partitioner(dist_context, rank)
dist_main_prog, _, _ = partitioner.partition(main_program, start_program,
[])
return dist_main_prog, dist_context
class TestDistSplit(unittest.TestCase):
def test_dist_split_dp2(self):
for rank in range(2):
dist_main_prog, dist_context = parallelizer(make_program_dp2, rank)
ops = dist_main_prog.global_block().ops
op_dist_attr = dist_context.get_op_dist_attr_for_program(ops[0])
assert op_dist_attr.impl_type == "split"
assert op_dist_attr.impl_idx == 0
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import tempfile
import shutil
import time
import random
import paddle
import paddle.vision.transforms as T
from paddle.static import InputSpec
from paddle.distributed.fleet import auto
from paddle.distributed.auto_parallel.callbacks import config_callbacks
from paddle.vision.models import LeNet
from paddle.vision.datasets import MNIST
paddle.enable_static()
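# Exercises the auto-parallel callback hooks directly with synthetic logs,
# without running a real training loop.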
class TestCallbacks(unittest.TestCase):
def setUp(self):
self.save_dir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.save_dir)
def run_callback(self):
epochs = 2
steps = 5
freq = 2
eval_steps = 2
inputs_spec = [InputSpec([None, 1, 28, 28], 'float32', 'image')]
strategy = auto.Strategy()
strategy.auto_mode = "semi"
engine = auto.Engine(LeNet(), strategy=strategy)
engine.prepare(inputs_spec, mode="predict")
cbks = config_callbacks(engine=engine,
batch_size=128,
epochs=epochs,
steps=steps,
log_freq=freq,
verbose=self.verbose,
metrics=['loss', 'acc'],
save_dir=self.save_dir)
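        # Simulate train -> eval -> predict phases, pushing fake metric logs
        # through every callback hook.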
cbks.on_begin('train')
logs = {'loss': 50.341673, 'acc': 0.00256}
for epoch in range(epochs):
cbks.on_epoch_begin(epoch)
for step in range(steps):
cbks.on_batch_begin('train', step, logs)
logs['loss'] -= random.random() * 0.1
logs['acc'] += random.random() * 0.1
time.sleep(0.005)
cbks.on_batch_end('train', step, logs)
cbks.on_epoch_end(epoch, logs)
eval_logs = {'eval_loss': 20.341673, 'eval_acc': 0.256}
params = {
'steps': eval_steps,
'metrics': ['eval_loss', 'eval_acc'],
}
cbks.on_begin('eval', params)
for step in range(eval_steps):
cbks.on_batch_begin('eval', step, eval_logs)
eval_logs['eval_loss'] -= random.random() * 0.1
eval_logs['eval_acc'] += random.random() * 0.1
eval_logs['batch_size'] = 2
time.sleep(0.005)
cbks.on_batch_end('eval', step, eval_logs)
cbks.on_end('eval', eval_logs)
test_logs = {}
params = {'steps': eval_steps}
cbks.on_begin('predict', params)
for step in range(eval_steps):
cbks.on_batch_begin('predict', step, test_logs)
test_logs['batch_size'] = 2
time.sleep(0.005)
cbks.on_batch_end('predict', step, test_logs)
cbks.on_end('predict', test_logs)
cbks.on_end('train')
print(engine.history.history)
def test_callback_verbose_0(self):
self.verbose = 0
self.run_callback()
def test_callback_verbose_1(self):
self.verbose = 1
self.run_callback()
def test_callback_verbose_2(self):
self.verbose = 2
self.run_callback()
def test_callback_verbose_3(self):
self.verbose = 3
self.run_callback()
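# End-to-end check: Engine.fit/evaluate/predict on MNIST with LeNet should
# drive the default callbacks (logging, checkpointing, history collection).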
class TestCallbacksEngine(unittest.TestCase):
def setUp(self):
self.save_dir = tempfile.mkdtemp()
transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
self.train_dataset = MNIST(mode='train', transform=transform)
self.test_dataset = MNIST(mode='test', transform=transform)
self.prepare_engine()
def tearDown(self):
shutil.rmtree(self.save_dir)
def prepare_engine(self):
model = paddle.vision.models.LeNet()
loss = paddle.nn.CrossEntropyLoss()
base_lr = 1e-3
boundaries = [5, 8]
values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries,
values=values,
verbose=False)
optimizer = paddle.optimizer.Adam(learning_rate=lr,
parameters=model.parameters())
auto.fetch(model.parameters()[0], "param0", logging=True)
metrics = paddle.metric.Accuracy(topk=(1, 2))
self.engine = auto.Engine(model, loss, optimizer, metrics)
def test_fit_eval(self):
history = self.engine.fit(train_data=self.train_dataset,
valid_data=self.test_dataset,
batch_size=128,
steps_per_epoch=60,
valid_steps=40,
log_freq=20,
save_dir=self.save_dir,
save_freq=1)
print(history.history)
def test_eval(self):
self.engine.evaluate(valid_data=self.test_dataset,
batch_size=128,
steps=40,
log_freq=10)
def test_predict(self):
logger_cbks = paddle.callbacks.ProgBarLogger()
self.engine.predict(test_data=self.test_dataset,
batch_size=128,
callbacks=[logger_cbks])
if __name__ == '__main__':
unittest.main()
......
@@ -78,7 +78,7 @@ class TestLRScheduler(TestEngineBase):
def test_lr_scheduler(self):
self.init_engine()
self.engine.fit(self.dataset, batch_size=self.batch_size)
-        lr = self.engine._lr_optimizer._learning_rate
+        lr = self.engine._optimizer._learning_rate
assert isinstance(lr, paddle.optimizer.lr.LRScheduler)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.static as static
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.cluster import Cluster
from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context
from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner
from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
import sys
sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
paddle.enable_static()
batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = [
ProcessMesh([0, 1], dim_names=["x"]),
ProcessMesh([2, 3], dim_names=["x"])
]
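# Builds a serial two-layer GPT pretraining program (no manual sharding),
# together with its optimizer and feed/fetch vars, as input for the tuner.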
def get_program_v3():
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
# fleet.init(is_collective=True, strategy=dist_strategy)
place = paddle.set_device("gpu")
gpus = [0, 1]
batch_size = 8
sequence_len = 512
vocab_size = 1000
train_program = static.Program()
start_program = static.Program()
modeling.init_global()
modeling._global_parallel_strategy = None
# modeling.DPMPPP_MESH_LIST = [
# ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]),
# ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"])
# ]
with static.program_guard(train_program, start_program):
tokens = paddle.static.data(name="tokens",
shape=[batch_size, sequence_len],
dtype='int64')
position_ids = paddle.static.data(name="position_ids",
shape=[batch_size, sequence_len],
dtype='int64')
attention_mask = paddle.static.data(
name="attention_mask",
shape=[batch_size, 1, sequence_len, sequence_len],
dtype='float32')
labels = paddle.static.data(name="labels",
shape=[batch_size, sequence_len],
dtype='int64')
loss_mask = paddle.static.data(name="loss_mask",
shape=[batch_size, sequence_len],
dtype='float32')
data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
gpt = GPTModel(vocab_size=1000,
hidden_size=1024,
num_hidden_layers=2,
num_attention_heads=16,
intermediate_size=4 * 1024,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
max_position_embeddings=1024,
type_vocab_size=1,
initializer_range=0.02,
pad_token_id=0,
eos_token_id=7,
bos_token_id=0,
eol_token_id=3,
pp_degree=1)
model = GPTForPretraining(gpt,
vocab_size=1000,
hidden_size=64,
initializer_range=0.02)
preds = model(tokens, position_ids, attention_mask)
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
feed_vars = {
"inputs": [tokens, position_ids, attention_mask, loss_mask],
"labels": [labels]
}
fetch_vars = {"loss": [loss]}
return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars
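# The tuner runs at most 3 trials in "train" mode and stores the best
# parallel strategy; finishing without an exception is the pass criterion.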
class TestParallelTunerTrain(unittest.TestCase):
def test_tune_with_train(self):
flag = False
set_default_distributed_context(DistributedContext())
train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3(
)
cluster = Cluster()
cluster.gen_default_config_cluster(node_count=1, device_count=8)
dist_context = DistributedContext(train_program, start_program,
optimizer, loss, feed_vars,
fetch_vars, cluster)
dist_context.initialize()
parallel_tuner = ParallelTuner(dist_context, max_trials=3, mode="train")
parallel_tuner.tune()
parallel_tuner._store_best_parallel_strategy()
flag = True
self.assertTrue(flag)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.static as static
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.cluster import Cluster
from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context
from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner
from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
from paddle.distributed.auto_parallel.planner_v2 import Planner
from paddle.distributed.auto_parallel.strategy import Strategy
import sys
sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
paddle.enable_static()
batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = [
ProcessMesh([0, 1], dim_names=["x"]),
ProcessMesh([2, 3], dim_names=["x"])
]
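# Same GPT program builder, but annotated with the "dp_mp_pp" strategy over
# two 2x2 process meshes (pp_degree follows len(DPMPPP_MESH_LIST)).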
def get_program_v3():
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
# fleet.init(is_collective=True, strategy=dist_strategy)
place = paddle.set_device("gpu")
gpus = [0, 1]
batch_size = 8
sequence_len = 512
vocab_size = 1000
train_program = static.Program()
start_program = static.Program()
modeling.init_global()
modeling._global_parallel_strategy = "dp_mp_pp"
modeling.DPMPPP_MESH_LIST = [
ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]),
ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"])
]
with static.program_guard(train_program, start_program):
tokens = paddle.static.data(name="tokens",
shape=[batch_size, sequence_len],
dtype='int64')
position_ids = paddle.static.data(name="position_ids",
shape=[batch_size, sequence_len],
dtype='int64')
attention_mask = paddle.static.data(
name="attention_mask",
shape=[batch_size, 1, sequence_len, sequence_len],
dtype='float32')
labels = paddle.static.data(name="labels",
shape=[batch_size, sequence_len],
dtype='int64')
loss_mask = paddle.static.data(name="loss_mask",
shape=[batch_size, sequence_len],
dtype='float32')
data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
gpt = GPTModel(vocab_size=1000,
hidden_size=1024,
num_hidden_layers=2,
num_attention_heads=16,
intermediate_size=4 * 1024,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
max_position_embeddings=1024,
type_vocab_size=1,
initializer_range=0.02,
pad_token_id=0,
eos_token_id=7,
bos_token_id=0,
eol_token_id=3,
pp_degree=len(modeling.DPMPPP_MESH_LIST))
model = GPTForPretraining(gpt,
vocab_size=1000,
hidden_size=64,
initializer_range=0.02)
preds = model(tokens, position_ids, attention_mask)
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
feed_vars = {
"inputs": [tokens, position_ids, attention_mask, loss_mask],
"labels": [labels]
}
fetch_vars = {"loss": [loss]}
return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars
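# In "full" auto mode the Planner owns the ParallelTuner and plan() drives
# the whole search over the dp_mp_pp-annotated GPT program.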
class TestParallelTunerFull(unittest.TestCase):
def test_tune_with_planner(self):
flag = False
set_default_distributed_context(DistributedContext())
train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3(
)
cluster = Cluster()
cluster.gen_default_config_cluster(node_count=1, device_count=8)
strategy = Strategy()
strategy.auto_mode = "full"
dist_context = DistributedContext(train_program, start_program,
optimizer, loss, feed_vars,
fetch_vars, cluster, strategy)
dist_context.initialize()
planner = Planner("train", dist_context)
planner._parallel_tuner = ParallelTuner(planner._dist_context,
mode=planner._mode,
max_trials=3)
planner.plan()
flag = True
self.assertTrue(flag)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.static as static
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.cluster import Cluster
from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context
from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner
from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
import sys
sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
paddle.enable_static()
batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = [
ProcessMesh([0, 1], dim_names=["x"]),
ProcessMesh([2, 3], dim_names=["x"])
]
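# Identical dp_mp_pp GPT program builder, reused here for the predict-mode
# tuner test.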
def get_program_v3():
dist_strategy = fleet.DistributedStrategy()
dist_strategy.semi_auto = True
# fleet.init(is_collective=True, strategy=dist_strategy)
place = paddle.set_device("gpu")
gpus = [0, 1]
batch_size = 8
sequence_len = 512
vocab_size = 1000
train_program = static.Program()
start_program = static.Program()
modeling.init_global()
modeling._global_parallel_strategy = "dp_mp_pp"
modeling.DPMPPP_MESH_LIST = [
ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]),
ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"])
]
with static.program_guard(train_program, start_program):
tokens = paddle.static.data(name="tokens",
shape=[batch_size, sequence_len],
dtype='int64')
position_ids = paddle.static.data(name="position_ids",
shape=[batch_size, sequence_len],
dtype='int64')
attention_mask = paddle.static.data(
name="attention_mask",
shape=[batch_size, 1, sequence_len, sequence_len],
dtype='float32')
labels = paddle.static.data(name="labels",
shape=[batch_size, sequence_len],
dtype='int64')
loss_mask = paddle.static.data(name="loss_mask",
shape=[batch_size, sequence_len],
dtype='float32')
data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
gpt = GPTModel(vocab_size=1000,
hidden_size=1024,
num_hidden_layers=2,
num_attention_heads=16,
intermediate_size=4 * 1024,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
max_position_embeddings=1024,
type_vocab_size=1,
initializer_range=0.02,
pad_token_id=0,
eos_token_id=7,
bos_token_id=0,
eol_token_id=3,
pp_degree=len(modeling.DPMPPP_MESH_LIST))
model = GPTForPretraining(gpt,
vocab_size=1000,
hidden_size=64,
initializer_range=0.02)
preds = model(tokens, position_ids, attention_mask)
criterion = GPTPretrainingCriterion()
loss = criterion(preds, labels, loss_mask)
optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
grad_clip=None)
feed_vars = {
"inputs": [tokens, position_ids, attention_mask, loss_mask],
"labels": [labels]
}
fetch_vars = {"loss": [loss]}
return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars
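# Runs the same 3-trial search in "predict" mode.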
class TestParallelTunerPredict(unittest.TestCase):
def test_tune_predict(self):
flag = False
set_default_distributed_context(DistributedContext())
train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3(
)
cluster = Cluster()
cluster.gen_default_config_cluster(node_count=1, device_count=8)
dist_context = DistributedContext(train_program, start_program,
optimizer, loss, feed_vars,
fetch_vars, cluster)
dist_context.initialize()
parallel_tuner = ParallelTuner(dist_context,
max_trials=3,
mode="predict")
parallel_tuner.tune()
flag = True
self.assertTrue(flag)
if __name__ == "__main__":
unittest.main()
......
@@ -101,6 +101,12 @@ class TestProcessMesh(unittest.TestCase):
self.assertEqual(sub_process_mesh4.dim_names, ["d0"])
self.assertEqual(sub_process_mesh4.ndim, 1)
+        sub_process_mesh5 = sub_process_mesh3[0]
+        self.assertEqual(sub_process_mesh5.shape, [1])
+        self.assertEqual(sub_process_mesh5.process_ids, [1])
+        self.assertEqual(sub_process_mesh5.dim_names, ["d0"])
+        self.assertEqual(sub_process_mesh5.ndim, 1)
def test_context_manager(self):
mesh = np.array([1, 2, 3, 4])
input = static.data(name="input",
......
......
@@ -110,7 +110,7 @@ class TestWholeProgram(unittest.TestCase):
program_helper.to('train')
forward_ops = program_helper.main_program.block(0).ops
-        self.assertEqual(len(forward_ops), 21)
+        self.assertEqual(len(forward_ops), 17)
        # step 2: apply optimizer to generate whole program
optimize_ops, _ = program_helper.apply_optimizer(optimizer)
......
@@ -119,7 +119,7 @@ class TestWholeProgram(unittest.TestCase):
op for op in program_helper.main_program.block(0).ops
if op.type == 'sgd'
]
-        self.assertEqual(len(all_ops), 41)
+        self.assertEqual(len(all_ops), 37)
self.assertEqual(len(optimize_ops), len(sgd_ops))
program_helper.reset()
......
......
@@ -136,6 +136,16 @@ class TestTunableSpace(unittest.TestCase):
self.assertEqual(new_space.variables["int_range"].step, 1)
self.assertEqual(new_space.variables["int_range"].endpoint, False)
+    def test_exception(self):
+        space = ts.TunableSpace()
+        flag = True
+        try:
+            val = space.get_value("test")
+            flag = False
+        except:
+            pass
+        self.assertTrue(flag)
if __name__ == "__main__":
unittest.main()
......
@@ -187,6 +187,14 @@ class TestMLP(unittest.TestCase):
train_program)
# print_program_with_dist_attr(complete_train_program, dist_context)
+    def test_completer_by_dist_op(self):
+        train_program, start_program, dataloader, i, loss = get_program()
+        dist_context = DistributedContext()
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
+        complete_train_program = completer._complete_tensor_dist_attr_by_op()
if __name__ == "__main__":
unittest.main()
......
@@ -305,14 +305,14 @@ class TransformerDecoder(nn.Layer):
auto.shard_tensor(output, PP_MESH_LIST[0],
[None for i in range(len(output.shape))])
if _global_parallel_strategy == "dp_pp":
auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"].extends(
[None for i in range(len(output.shape) - 1)]))
auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"] +
[None for i in range(len(output.shape) - 1)])
if _global_parallel_strategy == "mp_pp":
auto.shard_tensor(output, MPPP_MESH_LIST[0],
[None for i in range(len(output.shape))])
if _global_parallel_strategy == "dp_mp_pp":
auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"].extends(
[None for i in range(len(output.shape) - 1)]))
auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"] +
[None for i in range(len(output.shape) - 1)])
for i, mod in enumerate(self.layers):
if cache is None:
if use_cache:
......
@@ -330,8 +330,8 @@ class TransformerDecoder(nn.Layer):
tgt_mask,
use_cache, cache)
auto.shard_tensor(
output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends(
[None for i in range(len(output.shape) - 1)]))
output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] +
[None for i in range(len(output.shape) - 1)])
elif _global_parallel_strategy == "mp_pp":
output, new_cache = auto.shard_op(
mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory,
......
@@ -369,8 +369,8 @@ class TransformerDecoder(nn.Layer):
tgt_mask,
use_cache, cache)
auto.shard_tensor(
output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends(
[None for i in range(len(output.shape) - 1)]))
output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] +
[None for i in range(len(output.shape) - 1)])
elif _global_parallel_strategy == "mp_pp":
output = auto.shard_op(
mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory,
......
@@ -385,9 +385,8 @@ class TransformerDecoder(nn.Layer):
output, memory, tgt_mask,
use_cache, cache)
auto.shard_tensor(
-                        output, DPMPPP_MESH_LIST[mod.mesh_idx],
-                        ["x"].extends(
-                            [None for i in range(len(output.shape) - 1)]))
+                        output, DPMPPP_MESH_LIST[mod.mesh_idx], ["x"] +
+                        [None for i in range(len(output.shape) - 1)])
else:
output = mod(output,
memory,
......
@@ -407,9 +406,9 @@ class TransformerDecoder(nn.Layer):
mod,
DPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask,
use_cache, cache)
-                    auto.shard_tensor(output, DPPP_MESH_LIST[mod.mesh_idx], [
-                        "x"
-                    ].extends([None for i in range(len(output.shape) - 1)]))
+                    auto.shard_tensor(
+                        output, DPPP_MESH_LIST[mod.mesh_idx],
+                        ["x"] + [None for i in range(len(output.shape) - 1)])
elif _global_parallel_strategy == "mp_pp":
output, new_cache = auto.shard_op(
mod,
......
@@ -422,9 +421,9 @@ class TransformerDecoder(nn.Layer):
mod, DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory,
tgt_mask,
use_cache, cache)
-                    auto.shard_tensor(output, DPMPPP_MESH_LIST[mod.mesh_idx], [
-                        "x"
-                    ].extends([None for i in range(len(output.shape) - 1)]))
+                    auto.shard_tensor(
+                        output, DPMPPP_MESH_LIST[mod.mesh_idx],
+                        ["x"] + [None for i in range(len(output.shape) - 1)])
else:
output, new_cache = mod(output,
memory,
......
@@ -689,11 +688,11 @@ class GPTModel(nn.Layer):
auto.shard_tensor(input_ids, PP_MESH_LIST[0],
[None for i in range(len(input_ids.shape))])
if _global_parallel_strategy == "dp_pp":
auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"].extends(
[None for i in range(len(input_ids.shape) - 1)]))
auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"] +
[None for i in range(len(input_ids.shape) - 1)])
if _global_parallel_strategy == "dp_mp_pp":
auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"].extends(
[None for i in range(len(input_ids.shape) - 1)]))
auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"] +
[None for i in range(len(input_ids.shape) - 1)])
encoder_outputs = self.decoder(embedding_output,
memory=None,
tgt_mask=attention_mask,
......
......
@@ -20,7 +20,7 @@ import warnings
import numpy as np
import paddle
-from paddle.distributed import ParallelEnv
+from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.utils import try_import
from .progressbar import ProgressBar
......
......
@@ -50,6 +50,7 @@ from paddle.static import InputSpec as Input
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.base import role_maker
+from paddle.autograd import no_grad
from .callbacks import config_callbacks, EarlyStopping
from .model_summary import summary
......
@@ -1105,7 +1106,7 @@ class Model(object):
self._update_inputs()
return loss
-    @paddle.no_grad()
+    @no_grad()
def eval_batch(self, inputs, labels=None):
"""
Run one evaluating step on a batch of data.
......
@@ -1157,7 +1158,7 @@ class Model(object):
self._update_inputs()
return loss
-    @paddle.no_grad()
+    @no_grad()
def predict_batch(self, inputs):
"""
Run one predicting step on a batch of data.
......
......
@@ -19,7 +19,7 @@ import numbers
import paddle
import paddle.nn as nn
from paddle.static import InputSpec
+from paddle.autograd import no_grad
from collections import OrderedDict
__all__ = []
......
@@ -229,7 +229,7 @@ def summary(net, input_size=None, dtypes=None, input=None):
return params_info
-@paddle.no_grad()
+@no_grad()
def summary_string(model, input_size=None, dtypes=None, input=None):
def _all_is_numper(items):
......