Unverified commit 1490aaa9 authored by ustiniankw, committed by GitHub

[cherry-pick2.4]en-docs warning&error fix (#48332)

* fixdocs, test=document_fix

* fixdocs, test=document_fix
Parent 3fa7a736
......@@ -26,7 +26,6 @@ non_auto_func_called = True
def __non_auto_func_called__(func):
def __impl__(*args, **kwargs):
global non_auto_func_called
non_auto_func_called = False
......@@ -112,6 +111,7 @@ class DistributedStrategy(object):
def __init__(self):
"""
DistributedStrategy is the main configuration entry for distributed training of Paddle.
All of the distributed training configurations can be configured in DistributedStrategy,
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
......@@ -129,7 +129,8 @@ class DistributedStrategy(object):
key = 'FLAGS_cudnn_batchnorm_spatial_persistent'
if _global_flags().is_public(key):
self.strategy.cudnn_batchnorm_spatial_persistent = bool(
_global_flags()[key])
_global_flags()[key]
)
key = 'FLAGS_conv_workspace_size_limit'
if _global_flags().is_public(key):
self.strategy.conv_workspace_size_limit = int(_global_flags()[key])
......@@ -144,16 +145,17 @@ class DistributedStrategy(object):
def __setattr__(self, key, value):
if self.__lock_attr and not hasattr(self, key):
raise TypeError("%s is not a attribute of %s" %
(key, self.__class__.__name__))
raise TypeError(
"%s is not a attribute of %s" % (key, self.__class__.__name__)
)
object.__setattr__(self, key, value)
def save_to_prototxt(self, output):
"""
Serialize current DistributedStrategy to string and save to output file
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -162,25 +164,28 @@ class DistributedStrategy(object):
strategy.recompute = True
strategy.recompute_configs = {"checkpoints": ["x"]}
strategy.save_to_prototxt("dist_strategy.prototxt")
"""
with open(output, "w") as fout:
fout.write(str(self.strategy))
def load_from_prototxt(self, pb_file):
"""
Load from prototxt file for DistributedStrategy initialization
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.load_from_prototxt("dist_strategy.prototxt")
"""
with open(pb_file, 'r') as f:
self.strategy = google.protobuf.text_format.Merge(
str(f.read()), self.strategy)
str(f.read()), self.strategy
)
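A brief round-trip sketch (not part of this diff; assumes Paddle is installed and the working directory is writable): any options set on the strategy are serialized by save_to_prototxt and restored by load_from_prototxt. sync_nccl_allreduce is just an example flag documented later in this file.

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.sync_nccl_allreduce = True
    # serialize the underlying protobuf message as text
    strategy.save_to_prototxt("dist_strategy.prototxt")

    # a fresh strategy can be initialized from the saved file
    restored = fleet.DistributedStrategy()
    restored.load_from_prototxt("dist_strategy.prototxt")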
@property
def execution_strategy(self):
......@@ -188,7 +193,6 @@ class DistributedStrategy(object):
Configure ExecutionStrategy for DistributedStrategy
Examples:
.. code-block:: python
import paddle
......@@ -199,12 +203,16 @@ class DistributedStrategy(object):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.execution_strategy = exe_strategy
"""
execution_strategy = paddle.fluid.ExecutionStrategy()
fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields:
setattr(execution_strategy, f.name,
getattr(self.strategy.execution_strategy, f.name))
setattr(
execution_strategy,
f.name,
getattr(self.strategy.execution_strategy, f.name),
)
return execution_strategy
@execution_strategy.setter
......@@ -212,18 +220,21 @@ class DistributedStrategy(object):
def execution_strategy(self, strategy):
fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields:
setattr(self.strategy.execution_strategy, f.name,
getattr(strategy, f.name))
setattr(
self.strategy.execution_strategy,
f.name,
getattr(strategy, f.name),
)
@property
def build_strategy(self):
"""
Configure BuildStrategy for DistributedStrategy
Note that the properties of BuildStrategy are valid in DistributedStrategy
only if the property is non-distributed strategy.
Examples:
.. code-block:: python
import paddle
......@@ -239,6 +250,7 @@ class DistributedStrategy(object):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.build_strategy = build_strategy
"""
build_strategy = paddle.fluid.BuildStrategy()
......@@ -261,41 +273,48 @@ class DistributedStrategy(object):
value = ReduceStrategyFleet(value)
setattr(self.strategy.build_strategy, f.name, value)
elif f.label == 3: # repeated field
getattr(self.strategy.build_strategy,
f.name).extend(getattr(strategy, f.name))
getattr(self.strategy.build_strategy, f.name).extend(
getattr(strategy, f.name)
)
@property
def gradient_scale_configs(self):
"""
Set the strategy of gradient scale
Examples:
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.gradient_scale_configs = {'scale_strategy': 'avg'}
Note that the strategy must be one of 'avg', 'sum' or 'customized'.
"""
return get_msg_dict(self.strategy.gradient_scale_configs)
@gradient_scale_configs.setter
@is_strict_auto
def gradient_scale_configs(self, config):
check_configs_key(self.strategy.gradient_scale_configs, config,
'gradient_scale_configs')
check_configs_key(
self.strategy.gradient_scale_configs,
config,
'gradient_scale_configs',
)
assign_configs_value(self.strategy.gradient_scale_configs, config)
@property
def a_sync(self):
"""
Indicating whether we are using asynchronous stochastic gradient descent updates
for training. This property is valid when we are using parameter server training,
which is implied by setting an appropriate RoleMaker.
Default value: True
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -307,6 +326,7 @@ class DistributedStrategy(object):
# code block for defining loss and local optimizer
# sgd = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.a_sync
......@@ -318,12 +338,15 @@ class DistributedStrategy(object):
self.a_sync_configs = {"k_steps": 0}
else:
raise ValueError(
"The type of `flag` is invalid, expected type is bool, but received {}"
.format(type(flag)))
"The type of `flag` is invalid, expected type is bool, but received {}".format(
type(flag)
)
)
@property
def a_sync_configs(self):
"""
Set a_sync update configurations. In general, asynchronous parameter server
training has several configurable settings that can be configured through
a dict.
......@@ -344,7 +367,6 @@ class DistributedStrategy(object):
runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -365,13 +387,15 @@ class DistributedStrategy(object):
@a_sync_configs.setter
@is_strict_auto
def a_sync_configs(self, configs):
check_configs_key(self.strategy.a_sync_configs, configs,
"a_sync_configs")
check_configs_key(
self.strategy.a_sync_configs, configs, "a_sync_configs"
)
assign_configs_value(self.strategy.a_sync_configs, configs)
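A hedged sketch of asynchronous parameter-server configuration (role-maker setup and optimizer wiring are omitted, as in the docstring above; runtime_split_send_recv is one of the documented a_sync_configs keys):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True   # asynchronous SGD updates for parameter server training
    strategy.a_sync_configs = {"runtime_split_send_recv": True}
    print(strategy.a_sync_configs)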
@property
def trainer_desc_configs(self):
"""
Set trainer desc configurations.
**Notes**:
......@@ -384,7 +408,6 @@ class DistributedStrategy(object):
stat_var_names(list(str)):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -404,11 +427,11 @@ class DistributedStrategy(object):
@property
def adam_d2sum(self):
"""
set adam_d2sum
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -420,6 +443,7 @@ class DistributedStrategy(object):
# code block for defining loss and local optimizer
# sgd = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.adam_d2sum
......@@ -430,27 +454,37 @@ class DistributedStrategy(object):
self.strategy.adam_d2sum = flag
else:
raise ValueError(
"The type of `flag` is invalid, expected type is bool, but received {}"
.format(type(flag)))
"The type of `flag` is invalid, expected type is bool, but received {}".format(
type(flag)
)
)
@trainer_desc_configs.setter
@is_strict_auto
def trainer_desc_configs(self, configs):
check_configs_key(self.strategy.trainer_desc_configs, configs,
"trainer_desc_configs")
check_configs_key(
self.strategy.trainer_desc_configs, configs, "trainer_desc_configs"
)
assign_configs_value(self.strategy.trainer_desc_configs, configs)
@property
def fs_client_param(self):
"""
Set fs client configurations.
**Notes**:
Note:
uri(str): the uri of fs client
user(str): the user_name of fs client
passwd(str): the passwd of fs client
hadoop_bin(str):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
role_maker = fleet.PaddleCloudRoleMaker()
fleet.init(role_maker)
......@@ -459,14 +493,16 @@ class DistributedStrategy(object):
strategy.fs_client_param = configs
# code block for defining loss and local optimizer
# sgd = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.fs_client_param
@fs_client_param.setter
@is_strict_auto
def fs_client_param(self, configs):
check_configs_key(self.strategy.fs_client_param, configs,
"fs_client_param")
check_configs_key(
self.strategy.fs_client_param, configs, "fs_client_param"
)
assign_configs_value(self.strategy.fs_client_param, configs)
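A minimal sketch of fs_client_param using only the keys named in the docstring above; the address and credentials are placeholders:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.fs_client_param = {
        "uri": "hdfs://127.0.0.1:9000",   # placeholder HDFS address
        "user": "my_user",                # placeholder user name
        "passwd": "my_passwd",            # placeholder password
        "hadoop_bin": "hadoop",
    }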
@property
......@@ -477,6 +513,7 @@ class DistributedStrategy(object):
@is_strict_auto
def sparse_table_configs(self, configs):
from google.protobuf.descriptor import FieldDescriptor
table_param = self.strategy.downpour_table_param
def set_table_config(msg, config_name, configs, index=0):
......@@ -493,8 +530,9 @@ class DistributedStrategy(object):
data = getattr(msg, field.name).add()
set_table_config(data, name, configs, i)
else:
set_table_config(getattr(msg, field.name), name,
configs)
set_table_config(
getattr(msg, field.name), name, configs
)
else:
# print("not message:", name)
if name not in configs:
......@@ -513,133 +551,206 @@ class DistributedStrategy(object):
for table_name in configs:
table_data = table_param.add()
table_data.table_name = table_name
set_table_config(table_data, "table_parameters." + table_name,
configs[table_name])
set_table_config(
table_data,
"table_parameters." + table_name,
configs[table_name],
)
@sparse_table_configs.setter
def fleet_desc_configs(self, configs):
support_sparse_key_list = ['sparse_table_class', 'sparse_compress_in_save', 'sparse_shard_num', \
'sparse_accessor_class', 'sparse_learning_rate', 'sparse_initial_g2sum', 'sparse_initial_range', \
'sparse_weight_bounds', 'sparse_fea_dim', 'sparse_embedx_dim', 'sparse_embedx_threshold', 'sparse_nonclk_coeff', \
'sparse_click_coeff', 'sparse_base_threshold', 'sparse_delta_threshold', 'sparse_delta_keep_days', \
'sparse_delete_after_unseen_days', 'sparse_show_click_decay_rate', 'sparse_delete_threshold', \
'sparse_converter', 'sparse_deconverter', 'sparse_enable_cache', 'sparse_cache_rate', \
'sparse_cache_file_num', 'sparse_beta1_decay_rate', 'sparse_beta2_decay_rate', \
'sparse_ada_epsilon', 'sparse_optimizer', 'sparse_ssd_unseenday_threshold',
'embed_sparse_optimizer', 'embed_sparse_learning_rate', 'embed_sparse_weight_bounds', \
'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \
'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \
'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \
'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot']
support_sparse_key_list = [
'sparse_table_class',
'sparse_compress_in_save',
'sparse_shard_num',
'sparse_accessor_class',
'sparse_learning_rate',
'sparse_initial_g2sum',
'sparse_initial_range',
'sparse_weight_bounds',
'sparse_fea_dim',
'sparse_embedx_dim',
'sparse_embedx_threshold',
'sparse_nonclk_coeff',
'sparse_click_coeff',
'sparse_base_threshold',
'sparse_delta_threshold',
'sparse_delta_keep_days',
'sparse_delete_after_unseen_days',
'sparse_show_click_decay_rate',
'sparse_delete_threshold',
'sparse_converter',
'sparse_deconverter',
'sparse_enable_cache',
'sparse_cache_rate',
'sparse_cache_file_num',
'sparse_beta1_decay_rate',
'sparse_beta2_decay_rate',
'sparse_ada_epsilon',
'sparse_optimizer',
'sparse_ssd_unseenday_threshold',
'embed_sparse_optimizer',
'embed_sparse_learning_rate',
'embed_sparse_weight_bounds',
'embed_sparse_initial_range',
'embed_sparse_initial_g2sum',
'embed_sparse_beta1_decay_rate',
'embed_sparse_beta2_decay_rate',
'embedx_sparse_optimizer',
'embedx_sparse_learning_rate',
'embedx_sparse_weight_bounds',
'embedx_sparse_initial_range',
'embedx_sparse_initial_g2sum',
'embedx_sparse_beta1_decay_rate',
'embedx_sparse_beta2_decay_rate',
'feature_learning_rate',
'nodeid_slot',
]
support_sparse_table_class = ['DownpourSparseTable']
support_sparse_accessor_class = [
'DownpourSparseValueAccessor', 'DownpourCtrAccessor',
'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor',
'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor'
'DownpourSparseValueAccessor',
'DownpourCtrAccessor',
'DownpourCtrDoubleAccessor',
'DownpourUnitAccessor',
'DownpourDoubleUnitAccessor',
'DownpourCtrDymfAccessor',
]
from google.protobuf.descriptor import FieldDescriptor
table_param = self.strategy.downpour_table_param
def add_graph_config(graph, strategy):
graph.feature_learning_rate = strategy.get('feature_learning_rate',
0.05)
graph.feature_learning_rate = strategy.get(
'feature_learning_rate', 0.05
)
graph.nodeid_slot = strategy.get('nodeid_slot', 9008)
def sparse_optimizer_config(sgd, strategy, prefix):
optimizer_name = strategy.get(prefix + "sparse_optimizer",
"adagrad")
optimizer_name = strategy.get(
prefix + "sparse_optimizer", "adagrad"
)
sgd.name = optimizer_name
if optimizer_name == "naive":
sgd.name = "SparseNaiveSGDRule"
sgd.naive.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.05)
prefix + 'sparse_learning_rate', 0.05
)
sgd.naive.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
prefix + 'sparse_initial_range', 1e-4
)
bounds = strategy.get(
prefix + 'sparse_weight_bounds', [-10, 10]
)
sgd.naive.weight_bounds.extend(bounds)
elif optimizer_name == "adagrad":
sgd.name = 'SparseAdaGradSGDRule'
sgd.adagrad.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.05)
prefix + 'sparse_learning_rate', 0.05
)
sgd.adagrad.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
prefix + 'sparse_initial_range', 1e-4
)
if prefix == "embed_":
sgd.adagrad.initial_range = 0
sgd.adagrad.initial_g2sum = strategy.get(
prefix + 'sparse_initial_g2sum', 3)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
prefix + 'sparse_initial_g2sum', 3
)
bounds = strategy.get(
prefix + 'sparse_weight_bounds', [-10, 10]
)
sgd.adagrad.weight_bounds.extend(bounds)
elif optimizer_name == "std_adagrad":
sgd.name = 'StdAdaGradSGDRule'
sgd.adagrad.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.05)
prefix + 'sparse_learning_rate', 0.05
)
sgd.adagrad.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
prefix + 'sparse_initial_range', 1e-4
)
if prefix == "embed_":
sgd.adagrad.initial_range = 0
sgd.adagrad.initial_g2sum = strategy.get(
prefix + 'sparse_initial_g2sum', 3)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
prefix + 'sparse_initial_g2sum', 3
)
bounds = strategy.get(
prefix + 'sparse_weight_bounds', [-10, 10]
)
sgd.adagrad.weight_bounds.extend(bounds)
elif optimizer_name == "adam":
sgd.name = 'SparseAdamSGDRule'
sgd.adam.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.001)
prefix + 'sparse_learning_rate', 0.001
)
sgd.adam.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
prefix + 'sparse_initial_range', 1e-4
)
sgd.adam.beta1_decay_rate = strategy.get(
prefix + 'sparse_beta1_decay_rate', 0.9)
prefix + 'sparse_beta1_decay_rate', 0.9
)
sgd.adam.beta2_decay_rate = strategy.get(
prefix + 'sparse_beta2_decay_rate', 0.999)
prefix + 'sparse_beta2_decay_rate', 0.999
)
sgd.adam.ada_epsilon = strategy.get(
prefix + 'sparse_ada_epsilon', 1e-8)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
prefix + 'sparse_ada_epsilon', 1e-8
)
bounds = strategy.get(
prefix + 'sparse_weight_bounds', [-10, 10]
)
sgd.adam.weight_bounds.extend(bounds)
elif optimizer_name == "shared_adam":
sgd.name = 'SparseSharedAdamSGDRule'
sgd.adam.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.001)
prefix + 'sparse_learning_rate', 0.001
)
sgd.adam.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
prefix + 'sparse_initial_range', 1e-4
)
sgd.adam.beta1_decay_rate = strategy.get(
prefix + 'sparse_beta1_decay_rate', 0.9)
prefix + 'sparse_beta1_decay_rate', 0.9
)
sgd.adam.beta2_decay_rate = strategy.get(
prefix + 'sparse_beta2_decay_rate', 0.999)
prefix + 'sparse_beta2_decay_rate', 0.999
)
sgd.adam.ada_epsilon = strategy.get(
prefix + 'sparse_ada_epsilon', 1e-8)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
prefix + 'sparse_ada_epsilon', 1e-8
)
bounds = strategy.get(
prefix + 'sparse_weight_bounds', [-10, 10]
)
sgd.adam.weight_bounds.extend(bounds)
def set_sparse_table_config(table_data, config):
for key in config:
if key not in support_sparse_key_list:
raise ValueError("strategy key '%s' not support" % (key))
table_class = config.get("sparse_table_class",
"DownpourSparseTable")
table_class = config.get(
"sparse_table_class", "DownpourSparseTable"
)
if table_class not in support_sparse_table_class:
raise ValueError(
"support sparse_table_class: ['DownpourSparseTable'], but actual %s"
% (table_class))
% (table_class)
)
table_data.table_class = 'MemorySparseTable'
table_data.shard_num = config.get('sparse_shard_num', 1000)
table_data.enable_sparse_table_cache = config.get(
'sparse_enable_cache', True)
'sparse_enable_cache', True
)
table_data.sparse_table_cache_rate = config.get(
'sparse_cache_rate', 0.00055)
'sparse_cache_rate', 0.00055
)
table_data.sparse_table_cache_file_num = config.get(
'sparse_cache_file_num', 16)
'sparse_cache_file_num', 16
)
accessor_class = config.get("sparse_accessor_class",
"DownpourCtrAccessor")
accessor_class = config.get(
"sparse_accessor_class", "DownpourCtrAccessor"
)
if accessor_class not in support_sparse_accessor_class:
raise ValueError(
"support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s"
% (accessor_class))
% (accessor_class)
)
if accessor_class.find("Double") >= 0:
table_data.accessor.accessor_class = 'CtrDoubleAccessor'
......@@ -654,7 +765,8 @@ class DistributedStrategy(object):
table_data.accessor.embedx_dim = config.get('sparse_embedx_dim', 8)
table_data.accessor.fea_dim = table_data.accessor.embedx_dim + 3
table_data.accessor.embedx_threshold = config.get(
'sparse_embedx_threshold', 10)
'sparse_embedx_threshold', 10
)
if accessor_class == 'DownpourUnitAccessor':
table_data.accessor.ctr_accessor_param.show_scale = False
......@@ -662,23 +774,32 @@ class DistributedStrategy(object):
table_data.accessor.ctr_accessor_param.show_scale = True
table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get(
'sparse_nonclk_coeff', 0.1)
'sparse_nonclk_coeff', 0.1
)
table_data.accessor.ctr_accessor_param.click_coeff = config.get(
'sparse_click_coeff', 1)
'sparse_click_coeff', 1
)
table_data.accessor.ctr_accessor_param.base_threshold = config.get(
'sparse_base_threshold', 1.5)
'sparse_base_threshold', 1.5
)
table_data.accessor.ctr_accessor_param.delta_threshold = config.get(
'sparse_delta_threshold', 0.25)
'sparse_delta_threshold', 0.25
)
table_data.accessor.ctr_accessor_param.delta_keep_days = config.get(
'sparse_delta_keep_days', 16)
table_data.accessor.ctr_accessor_param.show_click_decay_rate = config.get(
'sparse_show_click_decay_rate', 0.98)
table_data.accessor.ctr_accessor_param.delete_threshold = config.get(
'sparse_delete_threshold', 0.8)
table_data.accessor.ctr_accessor_param.delete_after_unseen_days = config.get(
'sparse_delete_after_unseen_days', 30)
table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = config.get(
'sparse_ssd_unseenday_threshold', 1)
'sparse_delta_keep_days', 16
)
table_data.accessor.ctr_accessor_param.show_click_decay_rate = (
config.get('sparse_show_click_decay_rate', 0.98)
)
table_data.accessor.ctr_accessor_param.delete_threshold = (
config.get('sparse_delete_threshold', 0.8)
)
table_data.accessor.ctr_accessor_param.delete_after_unseen_days = (
config.get('sparse_delete_after_unseen_days', 30)
)
table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = (
config.get('sparse_ssd_unseenday_threshold', 1)
)
converter = config.get('sparse_converter', "")
deconverter = config.get('sparse_deconverter', "")
......@@ -692,23 +813,33 @@ class DistributedStrategy(object):
save_data2.converter = converter
save_data2.deconverter = deconverter
if accessor_class == 'DownpourCtrAccessor' or accessor_class == 'DownpourCtrDoubleAccessor':
sparse_optimizer_config(table_data.accessor.embed_sgd_param,
config, '')
sparse_optimizer_config(table_data.accessor.embedx_sgd_param,
config, '')
if (
accessor_class == 'DownpourCtrAccessor'
or accessor_class == 'DownpourCtrDoubleAccessor'
):
sparse_optimizer_config(
table_data.accessor.embed_sgd_param, config, ''
)
sparse_optimizer_config(
table_data.accessor.embedx_sgd_param, config, ''
)
else:
sparse_optimizer_config(table_data.accessor.embed_sgd_param,
config, 'embed_')
sparse_optimizer_config(table_data.accessor.embedx_sgd_param,
config, 'embedx_')
sparse_optimizer_config(
table_data.accessor.embed_sgd_param, config, 'embed_'
)
sparse_optimizer_config(
table_data.accessor.embedx_sgd_param, config, 'embedx_'
)
add_graph_config(table_data.accessor.graph_sgd_param, config)
if not configs:
print("fleet desc config is empty")
else:
for table_name in configs:
if table_name == 'dense_table' or table_name == 'datanorm_table':
if (
table_name == 'dense_table'
or table_name == 'datanorm_table'
):
continue
if type(configs[table_name]) != dict:
continue
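A hedged, configuration-only sketch of how a sparse-table dict flows through the setter above; "embedding" is an arbitrary table name, and every key comes from support_sparse_key_list. This is not a runnable training job.

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.fleet_desc_configs = {
        "embedding": {                                    # arbitrary sparse table name
            "sparse_table_class": "DownpourSparseTable",
            "sparse_accessor_class": "DownpourCtrAccessor",
            "sparse_optimizer": "adagrad",                # mapped to SparseAdaGradSGDRule above
            "sparse_learning_rate": 0.05,
            "sparse_embedx_dim": 8,
        }
    }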
......@@ -744,6 +875,7 @@ class DistributedStrategy(object):
@property
def amp_configs(self):
"""
Set automatic mixed precision training configurations. In general, amp has several configurable
settings that can be configured through a dict.
......@@ -772,7 +904,6 @@ class DistributedStrategy(object):
Default True. Only takes effect when `use_pure_fp16` is turned on.
Examples 1:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -783,7 +914,6 @@ class DistributedStrategy(object):
"custom_white_list": ['conv2d']}
Examples 2:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -794,6 +924,7 @@ class DistributedStrategy(object):
"init_loss_scaling": 32768,
"use_pure_fp16": True
}
"""
return get_msg_dict(self.strategy.amp_configs)
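A short sketch mirroring the docstring examples above (pure fp16 mode; op-level control is also available through "custom_white_list" / "custom_black_list"):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {
        "init_loss_scaling": 32768,
        "use_pure_fp16": True,
    }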
......@@ -806,11 +937,11 @@ class DistributedStrategy(object):
@property
def asp(self):
"""
Indicating whether we are using automatic sparsity training
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -835,7 +966,6 @@ class DistributedStrategy(object):
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -843,22 +973,24 @@ class DistributedStrategy(object):
strategy.recompute = True
# suppose x and y are names of checkpoint tensors for recomputation
strategy.recompute_configs = {"checkpoints": ["x", "y"]}
"""
return self.strategy.recompute
@property
def sync_nccl_allreduce(self):
"""
Indicating whether we are using synchronized allreduce in each communication thread.
We note that system overhead is usually lower when sync_nccl_allreduce = True.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sync_nccl_allreduce = True
"""
return self.strategy.sync_nccl_allreduce
......@@ -873,17 +1005,18 @@ class DistributedStrategy(object):
@property
def use_hierarchical_allreduce(self):
"""
Indicating whether we are using hierarchical allreduce in collective communication.
Hierarchical allreduce often does allreduce within a certain node group and then does
allreduce among the leaders of each group.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.use_hierarchical_allreduce = True
"""
return self.strategy.use_hierarchical_allreduce
......@@ -900,16 +1033,17 @@ class DistributedStrategy(object):
@property
def hierarchical_allreduce_inter_nranks(self):
"""
Number of ranks for low level node groups in hierarchical allreduce
Default value: number of GPU cards on each single GPU machine
Example:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.hierarchical_allreduce_inter_nranks = 8
"""
return self.strategy.hierarchical_allreduce_inter_nranks
......@@ -926,17 +1060,18 @@ class DistributedStrategy(object):
@property
def sync_batch_norm(self):
"""
Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sync_batch_norm = True
"""
return self.strategy.sync_batch_norm
......@@ -952,16 +1087,17 @@ class DistributedStrategy(object):
@property
def fuse_all_reduce_ops(self):
"""
Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
Default value: True
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_all_reduce_ops = False
"""
return self.strategy.fuse_all_reduce_ops
......@@ -976,17 +1112,18 @@ class DistributedStrategy(object):
@property
def fuse_grad_size_in_MB(self):
"""
Specifying the size of gradient to fuse in Mega-Bytes
Default value: 32
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_grad_size_in_MB = 50
"""
return self.strategy.fuse_grad_size_in_MB
......@@ -1001,6 +1138,7 @@ class DistributedStrategy(object):
@property
def last_comm_group_size_MB(self):
"""
Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.
......@@ -1013,6 +1151,7 @@ class DistributedStrategy(object):
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.last_comm_group_size_MB = 2
"""
return self.strategy.last_comm_group_size_MB
......@@ -1027,18 +1166,19 @@ class DistributedStrategy(object):
@property
def find_unused_parameters(self):
"""
Indicating whether we are using find_unused_parameters to
find unused parameters in DataParallel.
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.find_unused_parameters = True
"""
return self.strategy.find_unused_parameters
......@@ -1070,17 +1210,18 @@ class DistributedStrategy(object):
@property
def nccl_comm_num(self):
"""
Specifying the number of NCCL communicators
Default value: 1
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.nccl_comm_num = 2
"""
return self.strategy.nccl_comm_num
......@@ -1104,6 +1245,7 @@ class DistributedStrategy(object):
@property
def recompute_configs(self):
"""
Set recompute configurations.
**Note**:
......@@ -1120,7 +1262,6 @@ class DistributedStrategy(object):
specific here should be determined ("-1" is not allowed).
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1137,13 +1278,15 @@ class DistributedStrategy(object):
@recompute_configs.setter
@is_strict_auto
def recompute_configs(self, configs):
check_configs_key(self.strategy.recompute_configs, configs,
"checkpoint_configs")
check_configs_key(
self.strategy.recompute_configs, configs, "checkpoint_configs"
)
assign_configs_value(self.strategy.recompute_configs, configs)
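A minimal sketch taken from the recompute docstring above; "x" and "y" stand for checkpoint tensor names in the user's own network:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {"checkpoints": ["x", "y"]}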
@property
def sharding(self):
"""
Indicating whether we are using sharding Optimizer for memory
optimization. We implement the sharding optimizer following the ZeRO-DP
idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
......@@ -1154,12 +1297,12 @@ class DistributedStrategy(object):
Default value: False
Examples:
.. code-block:: python
import paddle.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sharding = True
"""
return self.strategy.sharding
......@@ -1174,6 +1317,7 @@ class DistributedStrategy(object):
@property
def sharding_configs(self):
"""
Set sharding configurations.
**Note**:
......@@ -1211,7 +1355,6 @@ class DistributedStrategy(object):
Examples:
.. code-block:: python
# sharding-DP, 2 nodes with 8 gpus per node
......@@ -1225,23 +1368,25 @@ class DistributedStrategy(object):
"dp_degree": 2,
"gradient_merge_acc_step": 4,
}
"""
return get_msg_dict(self.strategy.sharding_configs)
@sharding_configs.setter
@is_strict_auto
def sharding_configs(self, configs):
check_configs_key(self.strategy.sharding_configs, configs,
"sharding_configs")
check_configs_key(
self.strategy.sharding_configs, configs, "sharding_configs"
)
assign_configs_value(self.strategy.sharding_configs, configs)
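A sketch of the sharding-DP layout from the docstring above (2 nodes with 8 GPUs each); "sharding_degree" is an assumed key, since the full key list is truncated in this diff:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 8,           # assumed key: GPUs per sharding group
        "dp_degree": 2,
        "gradient_merge_acc_step": 4,
    }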
@property
def without_graph_optimization(self):
"""
Run program using Executor other than ParallelExecutor.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1264,14 +1409,18 @@ class DistributedStrategy(object):
@property
def _calc_comm_same_stream(self):
"""
This is based on the raw_program_optimizer program.
Set whether to use the same stream for calc and comm when fusing allreduce.
The default value of calc_comm_same_stream is False.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.calc_comm_same_stream = True
"""
return self.strategy.calc_comm_same_stream
......@@ -1288,14 +1437,18 @@ class DistributedStrategy(object):
@property
def fuse_grad_merge(self):
"""
Set whether to fuse the gradients for gradient merge.
Note: this flag only affects gradient merge under pipeline mode.
The default value of fuse_grad_merge is False.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_param_grad = True
"""
return self.strategy.fuse_grad_merge
......@@ -1310,12 +1463,17 @@ class DistributedStrategy(object):
@property
def fuse_grad_size_in_num(self):
"""
This is based on the raw_program_optimizer program and sets the number of gradients fused into each allreduce op.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_grad_size_in_num = 2
"""
return self.strategy.fuse_grad_size_in_num
......@@ -1332,13 +1490,13 @@ class DistributedStrategy(object):
@property
def pipeline(self):
"""
Indicating whether we are using pipeline parallelism for distributed training.
The current implementation mainly focuses on pipeline parallelism within a single GPU machine and
data parallelism across GPU machines. The pipeline information is indicated through
device_guard information in user-defined program.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1383,6 +1541,7 @@ class DistributedStrategy(object):
@property
def pipeline_configs(self):
"""
Set pipeline parallelism configurations. In pipeline parallelism,
different parts of neural networks are running on different GPUS.
There are Tensor queue buffers between each pair of neighboring GPUs
......@@ -1398,7 +1557,6 @@ class DistributedStrategy(object):
**micro_batch_size**: the number of small batches in each user defined batch
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1413,17 +1571,18 @@ class DistributedStrategy(object):
@pipeline_configs.setter
@is_strict_auto
def pipeline_configs(self, configs):
check_configs_key(self.strategy.pipeline_configs, configs,
"pipeline_configs")
check_configs_key(
self.strategy.pipeline_configs, configs, "pipeline_configs"
)
assign_configs_value(self.strategy.pipeline_configs, configs)
@property
def tensor_parallel(self):
"""
Indicating whether we are using tensor parallel for distributed training.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1444,16 +1603,18 @@ class DistributedStrategy(object):
@property
def tensor_parallel_configs(self):
"""
Set tensor_parallel configurations.
**Notes**:
**Detailed arguments for tensor_parallel_configs**
**tensor_parallel_degree**: degree of tensor parallel
**tensor_init_seed**: parameter initialization random seed
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1468,54 +1629,62 @@ class DistributedStrategy(object):
@tensor_parallel_configs.setter
@is_strict_auto
def tensor_parallel_configs(self, configs):
check_configs_key(self.strategy.tensor_parallel_configs, configs,
"tensor_parallel_configs")
check_configs_key(
self.strategy.tensor_parallel_configs,
configs,
"tensor_parallel_configs",
)
assign_configs_value(self.strategy.tensor_parallel_configs, configs)
@property
def hybrid_configs(self):
"""
Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
needs to meet the following relationships
total_number_GPUs = dp_degree * mp_degree * pp_degree
**Note**:
dp_degree(int): set number of GPUs in a data parallel group. Default -1.
**dp_degree(int)**: set number of GPUs in a data parallel group. Default -1.
This value should be an integer greater than 0.
If it is not set, or set to -1, its value will be inferred
based on the total number of cards.
mp_degree(int): set number of GPUs in a model parallel group. Default 1
pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1
**mp_degree(int)**: set number of GPUs in a model parallel group. Default 1
**pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 1,
"mp_degree": 2,
"pp_degree": 1}
"""
return get_msg_dict(self.strategy.hybrid_configs)
@hybrid_configs.setter
def hybrid_configs(self, configs):
check_configs_key(self.strategy.hybrid_configs, configs,
"hybrid_configs")
check_configs_key(
self.strategy.hybrid_configs, configs, "hybrid_configs"
)
assign_configs_value(self.strategy.hybrid_configs, configs)
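A minimal dygraph sketch mirroring the docstring above; the product dp_degree * mp_degree * pp_degree must match the total number of GPUs (here 2):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": 1,
        "mp_degree": 2,
        "pp_degree": 1,
    }
    # fleet.init(is_collective=True, strategy=strategy) would then build the
    # hybrid communication groups (requires a multi-GPU launch)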
@property
def localsgd(self):
"""
Indicating whether we are using Local SGD training. Default Value: False
For more details, please refer to
`Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1536,6 +1705,7 @@ class DistributedStrategy(object):
@property
def localsgd_configs(self):
"""
Set LocalSGD training configurations. LocalSGD has a configurable
setting that can be configured through a dict.
......@@ -1544,7 +1714,6 @@ class DistributedStrategy(object):
begin_step(int) The step of beginning training by localsgd. Default 1.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1552,6 +1721,7 @@ class DistributedStrategy(object):
strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 4,
"begin_step": 30}
"""
return get_msg_dict(self.strategy.localsgd_configs)
......@@ -1559,20 +1729,20 @@ class DistributedStrategy(object):
@localsgd_configs.setter
@is_strict_auto
def localsgd_configs(self, configs):
check_configs_key(self.strategy.localsgd_configs, configs,
"localsgd_configs")
check_configs_key(
self.strategy.localsgd_configs, configs, "localsgd_configs"
)
assign_configs_value(self.strategy.localsgd_configs, configs)
@property
def adaptive_localsgd(self):
"""
Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1593,6 +1763,7 @@ class DistributedStrategy(object):
@property
def adaptive_localsgd_configs(self):
"""
Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
setting that can be configured through a dict.
......@@ -1600,10 +1771,10 @@ class DistributedStrategy(object):
init_k_steps(int) The initial steps for training before adaptive localsgd.
Then, the adaptive localsgd method will modify init_k_steps automatically.
Default 1.
begin_step(int) The step of beginning training by adaptive localsgd. Default 1.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1611,6 +1782,7 @@ class DistributedStrategy(object):
strategy.adaptive_localsgd = True
strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
"begin_step": 30}
"""
return get_msg_dict(self.strategy.adaptive_localsgd_configs)
......@@ -1618,20 +1790,23 @@ class DistributedStrategy(object):
@adaptive_localsgd_configs.setter
@is_strict_auto
def adaptive_localsgd_configs(self, configs):
check_configs_key(self.strategy.adaptive_localsgd_configs, configs,
"adaptive_localsgd_configs")
check_configs_key(
self.strategy.adaptive_localsgd_configs,
configs,
"adaptive_localsgd_configs",
)
assign_configs_value(self.strategy.adaptive_localsgd_configs, configs)
@property
def dgc(self):
"""
Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
[Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1652,6 +1827,7 @@ class DistributedStrategy(object):
@property
def dgc_configs(self):
r"""
Set Deep Gradient Compression training configurations. In general, dgc has several configurable
settings that can be configured through a dict.
......@@ -1668,13 +1844,13 @@ class DistributedStrategy(object):
element will be transmitted.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {"rampup_begin_step": 1252}
"""
return get_msg_dict(self.strategy.dgc_configs)
......@@ -1687,14 +1863,15 @@ class DistributedStrategy(object):
@property
def fp16_allreduce(self):
"""
Indicating whether we are using fp16 gradient allreduce training
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True # by default this is false
......@@ -1711,6 +1888,7 @@ class DistributedStrategy(object):
@property
def gradient_merge(self):
"""
Gradient Merge, also called Gradient Accumulation,
is a strategy for large batch training. With this strategy,
model parameters will not be updated until user-defined steps.
......@@ -1721,13 +1899,13 @@ class DistributedStrategy(object):
to model parameters.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
"""
return self.strategy.gradient_merge
......@@ -1742,6 +1920,7 @@ class DistributedStrategy(object):
@property
def gradient_merge_configs(self):
"""
the key-value configs of distribute_strategy
**Note**:
......@@ -1750,26 +1929,28 @@ class DistributedStrategy(object):
avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
"""
return get_msg_dict(self.strategy.gradient_merge_configs)
@gradient_merge_configs.setter
@is_strict_auto
def gradient_merge_configs(self, configs):
check_configs_key(self.strategy.gradient_merge_configs, configs,
"gradient_configs")
check_configs_key(
self.strategy.gradient_merge_configs, configs, "gradient_configs"
)
assign_configs_value(self.strategy.gradient_merge_configs, configs)
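A minimal sketch from the docstrings above: accumulate gradients over 4 steps and apply their average:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.gradient_merge = True
    strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}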
@property
def lars(self):
"""
Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
......@@ -1777,12 +1958,12 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.lars = True # by default this is false
"""
return self.strategy.lars
......@@ -1797,6 +1978,7 @@ class DistributedStrategy(object):
@property
def lars_configs(self):
"""
Set Lars training configurations.
**Notes**:
......@@ -1808,7 +1990,6 @@ class DistributedStrategy(object):
will be exclude from weight decay in lars formula.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1820,6 +2001,7 @@ class DistributedStrategy(object):
"epsilon": 0,
"exclude_from_weight_decay": ['batch_norm', '.b_0']
}
"""
return get_msg_dict(self.strategy.lars_configs)
......@@ -1832,6 +2014,7 @@ class DistributedStrategy(object):
@property
def lamb(self):
"""
Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-related models like BERT. For more details,
please refer to
......@@ -1840,12 +2023,12 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.lamb = True # by default this is false
"""
return self.strategy.lamb
......@@ -1861,6 +2044,7 @@ class DistributedStrategy(object):
@property
def lamb_configs(self):
"""
Set Lamb training configurations.
**Notes**:
......@@ -1869,7 +2053,6 @@ class DistributedStrategy(object):
will be exclude from weight decay in lamb formula.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1879,6 +2062,7 @@ class DistributedStrategy(object):
'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': [],
}
"""
return get_msg_dict(self.strategy.lamb_configs)
......@@ -1891,8 +2075,10 @@ class DistributedStrategy(object):
@property
def elastic(self):
"""
Indicating whether we want to do current distributed training on clusters with elastic resources.
Currently, this configuration is not valid.
"""
return self.strategy.elastic
......@@ -1907,6 +2093,7 @@ class DistributedStrategy(object):
@property
def auto(self):
"""
Indicating whether we are using auto-parallel configuration.
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
......@@ -1915,7 +2102,6 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle
......@@ -1929,6 +2115,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.auto
......@@ -1942,6 +2129,7 @@ class DistributedStrategy(object):
@property
def semi_auto(self):
"""
Indicating whether we are using the semi-auto parallel function.
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
......@@ -1950,7 +2138,6 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle
......@@ -1964,6 +2151,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.semi_auto
......@@ -1977,16 +2165,21 @@ class DistributedStrategy(object):
@property
def auto_search(self):
"""
Indicating whether we are using the auto-search parallel function.
For details, please refer to the following code example.
Default Value: False
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.auto_search = True
"""
return self.strategy.auto_search
......@@ -2000,15 +2193,20 @@ class DistributedStrategy(object):
@property
def split_data(self):
"""
Indicating whether we split the data. If True, we split the data.
Default Value: True
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.split_data = True
"""
return self.strategy.split_data
......@@ -2022,8 +2220,10 @@ class DistributedStrategy(object):
@property
def qat(self):
"""
Indicating whether we are using quantization training
Default Value: False
"""
return self.strategy.qat
......@@ -2037,6 +2237,7 @@ class DistributedStrategy(object):
@property
def qat_configs(self):
"""
Set quantization training configurations. In general, qat has several configurable
settings that can be configured through a dict.
......@@ -2053,10 +2254,10 @@ class DistributedStrategy(object):
algo(str): Other quantization training algorithm.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.qat = True
strategy.qat_configs = {
......@@ -2076,13 +2277,13 @@ class DistributedStrategy(object):
@property
def heter_ccl_mode(self):
"""
Indicating whether we are using heter_ccl_mode for model training.
This feature is currently an experimental feature. Currently,
heter_ccl_mode can be used only for data parallel training with dygraph mode.
Default Value: False
Examples:
.. code-block:: python
import paddle
......@@ -2094,6 +2295,7 @@ class DistributedStrategy(object):
# for initialize parallel env, only need to call
paddle.distributed.init_parallel_env()
# then the heterogeneous context will be created.
"""
return self.strategy.heter_ccl_mode
......@@ -2107,6 +2309,7 @@ class DistributedStrategy(object):
@property
def cudnn_exhaustive_search(self):
"""
Indicating whether to use exhaustive search method to choose convolution algorithms.
Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm.
This method is time-consuming; the chosen algorithm will be cached for the given layer specifications.
......@@ -2114,17 +2317,18 @@ class DistributedStrategy(object):
Default Value: True
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.cudnn_exhaustive_search
......@@ -2141,6 +2345,7 @@ class DistributedStrategy(object):
@property
def conv_workspace_size_limit(self):
"""
The workspace limit size in MB unit for choosing cuDNN convolution algorithms.
The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit.
Usually, a large workspace size may lead to faster algorithms being chosen,
......@@ -2148,12 +2353,12 @@ class DistributedStrategy(object):
Default Value: 4000
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1024
......@@ -2176,17 +2381,18 @@ class DistributedStrategy(object):
@property
def cudnn_batchnorm_spatial_persistent(self):
"""
Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm.
This is only useful in cudnn.
Default Value: True
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = True
......@@ -2244,7 +2450,8 @@ class DistributedStrategy(object):
h1_format = " " + "|{{:^{}s}}|\n".format(length)
h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
max_k, " " * spacing, max_v)
max_k, " " * spacing, max_v
)
border = " +" + "".join(["="] * length) + "+"
line = " +" + "".join(["-"] * length) + "+"
......@@ -2269,37 +2476,48 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name):
draws += border + "\n"
draws += h1_format.format(
"{}=True <-> {}_configs".format(f.name, f.name))
"{}=True <-> {}_configs".format(f.name, f.name)
)
draws += line + "\n"
my_configs = getattr(self.strategy,
f.name + "_configs")
my_configs = getattr(
self.strategy, f.name + "_configs"
)
config_fields = my_configs.DESCRIPTOR.fields
for ff in config_fields:
if isinstance(
getattr(my_configs,
ff.name), google.protobuf.pyext.
_message.RepeatedScalarContainer):
getattr(my_configs, ff.name),
google.protobuf.pyext._message.RepeatedScalarContainer,
):
values = getattr(my_configs, ff.name)
for i, v in enumerate(values):
if i == 0:
draws += h2_format.format(
ff.name, str(v))
ff.name, str(v)
)
else:
draws += h2_format.format(
"", str(v))
"", str(v)
)
else:
draws += h2_format.format(
ff.name,
str(getattr(my_configs, ff.name)))
str(getattr(my_configs, ff.name)),
)
else:
env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name)))
f.name, str(getattr(self.strategy, f.name))
)
else:
env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name)))
f.name, str(getattr(self.strategy, f.name))
)
result_res = draws + border + "\n" + h1_format.format(
"Environment Flags, Communication Flags")
result_res = (
draws
+ border
+ "\n"
+ h1_format.format("Environment Flags, Communication Flags")
)
result_res += env_draws
build_strategy_str = border + "\n"
......@@ -2309,7 +2527,8 @@ class DistributedStrategy(object):
fields = self.strategy.build_strategy.DESCRIPTOR.fields
for f in fields:
build_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.build_strategy, f.name)))
f.name, str(getattr(self.strategy.build_strategy, f.name))
)
build_strategy_str += border + "\n"
execution_strategy_str = h1_format.format("Execution Strategy")
......@@ -2318,7 +2537,8 @@ class DistributedStrategy(object):
fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields:
execution_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.execution_strategy, f.name)))
f.name, str(getattr(self.strategy.execution_strategy, f.name))
)
execution_strategy_str += border + "\n"
result_res += build_strategy_str + execution_strategy_str
......
......@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None
class ParallelMode(object):
"""
These are all the parallel modes currently supported:
- DATA_PARALLEL: Distribute input data to different devices.
- TENSOR_PARALLEL: Shards tensors in the network to different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
corresponding to the parameters to each device.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
Examples:
.. code-block:: python
......@@ -43,6 +44,7 @@ class ParallelMode(object):
print(parallel_mode.DATA_PARALLEL) # 0
"""
DATA_PARALLEL = 0
TENSOR_PARALLEL = 1
PIPELINE_PARALLEL = 2
......@@ -50,14 +52,16 @@ class ParallelMode(object):
class CommunicateTopology(object):
def __init__(self,
def __init__(
self,
hybrid_group_names=["data", "pipe", "sharding", "model"],
dims=[1, 1, 1, 1]):
dims=[1, 1, 1, 1],
):
self._parallel_names = hybrid_group_names
self._dims = dims
self.coordinate = collections.namedtuple('Coordinate',
self._parallel_names)
self.coordinate = collections.namedtuple(
'Coordinate', self._parallel_names
)
self._world_size = reduce(lambda x, y: x * y, self._dims)
ranges = [range(d) for d in self._dims]
......@@ -65,7 +69,8 @@ class CommunicateTopology(object):
self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
self._rank2coord = dict(
zip(self._coord2rank.values(), self._coord2rank.keys()))
zip(self._coord2rank.values(), self._coord2rank.keys())
)
def get_hybrid_group_names(self):
return self._parallel_names
......@@ -90,7 +95,8 @@ class CommunicateTopology(object):
def get_axis_list(self, axis_name, index):
axis = self._parallel_names.index(axis_name)
ranks = [
self._coord2rank[coord] for coord in self._coord2rank.keys()
self._coord2rank[coord]
for coord in self._coord2rank.keys()
if coord[axis] == index
]
ranks.sort()
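A standalone sketch (plain Python, no Paddle required) of the coordinate-to-rank bookkeeping that CommunicateTopology.__init__ and get_axis_list build above: enumerate every coordinate in the hybrid grid, map it to a linear rank, and collect the ranks that share one axis index. The grid dimensions below are an arbitrary example.

.. code-block:: python

    import collections
    import itertools
    from functools import reduce

    parallel_names = ["data", "pipe", "sharding", "model"]
    dims = [2, 1, 1, 2]                      # example hybrid grid: dp=2, mp=2

    Coordinate = collections.namedtuple('Coordinate', parallel_names)
    world_size = reduce(lambda x, y: x * y, dims)            # 4
    all_coords = [Coordinate(*c) for c in itertools.product(*[range(d) for d in dims])]
    coord2rank = dict(zip(all_coords, range(len(all_coords))))
    rank2coord = dict(zip(coord2rank.values(), coord2rank.keys()))

    # ranks whose "model" coordinate equals 0, analogous to get_axis_list("model", 0)
    axis = parallel_names.index("model")
    ranks = sorted(r for c, r in coord2rank.items() if c[axis] == 0)
    print(world_size, ranks)   # 4 [0, 2]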
......@@ -132,7 +138,6 @@ class CommunicateTopology(object):
class HybridCommunicateGroup(object):
def __init__(self, topology):
self.nranks = paddle.distributed.get_world_size()
self.global_rank = paddle.distributed.get_rank()
......@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object):
self._sharding_parallel_id = self._get_sharding_parallel_id()
self.stage_id = self._get_pipe_parallel_id()
assert self._check_vaild_topo(
), "Here is an unreasonable topogy setting. world_size: {}, but" \
"mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks,
self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree)
assert self._check_vaild_topo(), (
"Here is an unreasonable topogy setting. world_size: {}, but"
"mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(
self.nranks,
self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
# create comm group for data parallel
self._dp_group, self._dp_comm_group = self._set_comm_group("data")
......@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object):
# create comm group for sharding parallel
self._sharding_group, self._sharding_comm_group = self._set_comm_group(
"sharding")
"sharding"
)
# create global group for check inf_nan / clip global norm
self._check_group, self._check_comm_group = self._set_check_group(
"data")
"data"
)
# create p2p group
self.is_first_stage = (self.stage_id == 0)
self.is_last_stage = (self.stage_id == (self._pp_degree - 1))
self.is_first_stage = self.stage_id == 0
self.is_last_stage = self.stage_id == (self._pp_degree - 1)
# create p2p_groups
if self._pp_degree > 1:
self._set_p2p_group()
debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \
"sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree,
self._sharding_degree, self._pp_degree, self._dp_degree)
debug_str += ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % (
self._mp_group, self._sharding_group, self._pp_group,
self._dp_group, self._check_group)
debug_str = (
"HybridParallelInfo: rank_id: %d, mp_degree: %d, "
"sharding_degree: %d, pp_degree: %d, dp_degree: %d"
% (
self.global_rank,
self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
debug_str += (
", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s"
% (
self._mp_group,
self._sharding_group,
self._pp_group,
self._dp_group,
self._check_group,
)
)
logger.info(debug_str)
global _HYBRID_PARALLEL_GROUP
......@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object):
# adding its parallel logic within that parallelism
# when use sharding alone, it should have its own parallelism for its parallel logic
# TODO modify 3 others parallel to support sharding
if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1:
if (
self._mp_degree == 1
and self._pp_degree == 1
and self._dp_degree == 1
and self._sharding_degree > 1
):
return ParallelMode.SHARDING_PARALLEL
elif self._mp_degree == 1 and self._pp_degree == 1:
return ParallelMode.DATA_PARALLEL
......@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object):
return ParallelMode.PIPELINE_PARALLEL
def _check_vaild_topo(self):
return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks
return (
self._dp_degree
* self._mp_degree
* self._pp_degree
* self._sharding_degree
== self.nranks
)
def _set_comm_group(self, parallel_method="data"):
parallel_group = []
......@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object):
self.prev_rank = prev_rank
next_group = paddle.distributed.new_group(
ranks=[curr_rank, next_rank])
ranks=[curr_rank, next_rank]
)
if self.global_rank == curr_rank:
self.send_next_group = next_group
elif self.global_rank == next_rank:
self.recv_prev_group = next_group
prev_group = paddle.distributed.new_group(
ranks=[prev_rank, curr_rank])
ranks=[prev_rank, curr_rank]
)
if self.global_rank == curr_rank:
self.send_prev_group = prev_group
......@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object):
return self._pp_comm_group
def get_p2p_groups(self):
return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group
return (
self.send_next_group,
self.send_prev_group,
self.recv_next_group,
self.recv_prev_group,
)
# sharding parallel message:
def _get_sharding_parallel_id(self):
......@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object):
return self._check_comm_group
def get_rank_from_stage(self, stage_id, **kwargs):
return self._topo.get_rank_from_stage(self.global_rank,
pipe=stage_id,
**kwargs)
return self._topo.get_rank_from_stage(
self.global_rank, pipe=stage_id, **kwargs
)
class _CommunicateGroup(object):
""" tmp for static """
"""tmp for static"""
def __init__(self):
global _HYBRID_PARALLEL_GROUP
_HYBRID_PARALLEL_GROUP = self
self.groups = dict()
def set_comm_group(self, group_name, group_rank, group_size, ring_id,
group_ranks):
group = paddle.distributed.collective.Group(group_rank, ring_id,
group_ranks)
def set_comm_group(
self, group_name, group_rank, group_size, ring_id, group_ranks
):
group = paddle.distributed.collective.Group(
group_rank, ring_id, group_ranks
)
self.groups[group_name] = group
def get_group(self, group_name):
......
......@@ -103,6 +103,7 @@ def _check_var_exists(var_name):
def init_parallel_env():
"""
Initialize parallel training environment in dynamic graph mode.
Note:
......@@ -118,6 +119,7 @@ def init_parallel_env():
Examples:
.. code-block:: python
# required: gpu
import paddle
import paddle.nn as nn
......@@ -158,6 +160,7 @@ def init_parallel_env():
if __name__ == '__main__':
dist.spawn(train)
"""
# 0. get env & check world size
......
......@@ -51,61 +51,76 @@ __all__ = [
def _check_normalization(norm):
if norm not in ['forward', 'backward', 'ortho']:
raise ValueError(
"Unexpected norm: {}. Norm should be forward, backward or ortho".
format(norm))
"Unexpected norm: {}. Norm should be forward, backward or ortho".format(
norm
)
)
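
The three accepted values follow the usual FFT normalization conventions. A small illustration using NumPy (used here only because its `norm` argument follows the same convention; this snippet is not part of the diff):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
n = len(x)
backward = np.fft.fft(x, norm="backward")   # no scaling on the forward transform
forward = np.fft.fft(x, norm="forward")     # forward transform scaled by 1/n
ortho = np.fft.fft(x, norm="ortho")         # both directions scaled by 1/sqrt(n)
assert np.allclose(forward, backward / n)
assert np.allclose(ortho, backward / np.sqrt(n))
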
def _check_fft_n(n):
if not isinstance(n, int):
raise ValueError(
"Invalid FFT argument n({}), it shoule be an integer.".format(n))
"Invalid FFT argument n({}), it shoule be an integer.".format(n)
)
if n <= 0:
raise ValueError(
"Invalid FFT argument n({}), it should be positive.".format(n))
"Invalid FFT argument n({}), it should be positive.".format(n)
)
def _check_fft_shape(x, s):
ndim = x.ndim
if not isinstance(s, Sequence):
raise ValueError(
"Invaid FFT argument s({}), it should be a sequence of integers.")
"Invaid FFT argument s({}), it should be a sequence of integers."
)
if len(s) > ndim:
raise ValueError(
"Length of FFT argument s should not be larger than the rank of input. "
"Received s: {}, rank of x: {}".format(s, ndim))
"Received s: {}, rank of x: {}".format(s, ndim)
)
for size in s:
if not isinstance(size, int) or size <= 0:
raise ValueError("FFT sizes {} contains invalid value ({})".format(
s, size))
raise ValueError(
"FFT sizes {} contains invalid value ({})".format(s, size)
)
def _check_fft_axis(x, axis):
ndim = x.ndim
if not isinstance(axis, int):
raise ValueError(
"Invalid FFT axis ({}), it shoule be an integer.".format(axis))
"Invalid FFT axis ({}), it shoule be an integer.".format(axis)
)
if axis < -ndim or axis >= ndim:
raise ValueError(
"Invalid FFT axis ({}), it should be in range [-{}, {})".format(
axis, ndim, ndim))
axis, ndim, ndim
)
)
def _check_fft_axes(x, axes):
ndim = x.ndim
if not isinstance(axes, Sequence):
raise ValueError(
"Invalid FFT axes ({}), it should be a sequence of integers.".
format(axes))
"Invalid FFT axes ({}), it should be a sequence of integers.".format(
axes
)
)
if len(axes) > ndim:
raise ValueError(
"Length of fft axes should not be larger than the rank of input. "
"Received, len of axes: {}, rank of x: {}".format(len(axes), ndim))
"Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)
)
for axis in axes:
if not isinstance(axis, int) or axis < -ndim or axis >= ndim:
raise ValueError(
"FFT axes {} contains invalid value ({}), it should be in range [-{}, {})"
.format(axes, axis, ndim, ndim))
"FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".format(
axes, axis, ndim, ndim
)
)
def _resize_fft_input(x, s, axes):
......@@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes):
slices.append((0, s[i]))
if axes_to_slice:
x = paddle.slice(x,
x = paddle.slice(
x,
axes_to_slice,
starts=[item[0] for item in slices],
ends=[item[1] for item in slices])
ends=[item[1] for item in slices],
)
if axes_to_pad:
padding_widths = [0] * (2 * ndim)
for axis, pad in zip(axes_to_pad, paddings):
......@@ -146,8 +163,9 @@ def _normalize_axes(x, axes):
def _check_at_least_ndim(x, rank):
if x.ndim < rank:
raise ValueError("The rank of the input ({}) should >= {}".format(
x.ndim, rank))
raise ValueError(
"The rank of the input ({}) should >= {}".format(x.ndim, rank)
)
# public APIs 1d
......@@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None):
"""
if is_integer(x) or is_floating_point(x):
return fft_r2c(x,
n,
axis,
norm,
forward=True,
onesided=False,
name=name)
return fft_r2c(
x, n, axis, norm, forward=True, onesided=False, name=name
)
else:
return fft_c2c(x, n, axis, norm, forward=True, name=name)
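
A hedged usage sketch of the dispatch above, assuming the module is exposed as `paddle.fft` (the public namespace in Paddle 2.x): real or integer input is routed through the r2c kernel with `onesided=False`, so the full two-sided complex spectrum is returned, unlike `paddle.fft.rfft`.

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])   # real input
full_spec = paddle.fft.fft(x)                # complex output, shape [4]
half_spec = paddle.fft.rfft(x)               # one-sided output, shape [3] (n//2 + 1)
print(full_spec.shape, half_spec.shape)
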
......@@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None):
"""
if is_integer(x) or is_floating_point(x):
return fft_r2c(x,
n,
axis,
norm,
forward=False,
onesided=False,
name=name)
return fft_r2c(
x, n, axis, norm, forward=False, onesided=False, name=name
)
else:
return fft_c2c(x, n, axis, norm, forward=False, name=name)
......@@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
# [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]]
"""
if is_integer(x) or is_floating_point(x):
return fftn_r2c(x,
s,
axes,
norm,
forward=True,
onesided=False,
name=name)
return fftn_r2c(
x, s, axes, norm, forward=True, onesided=False, name=name
)
else:
return fftn_c2c(x, s, axes, norm, forward=True, name=name)
......@@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None):
# (-0.1666666716337204+0.28867512941360474j)]])
"""
if is_integer(x) or is_floating_point(x):
return fftn_r2c(x,
s,
axes,
norm,
forward=False,
onesided=False,
name=name)
return fftn_r2c(
x, s, axes, norm, forward=False, onesided=False, name=name
)
else:
return fftn_c2c(x, s, axes, norm, forward=False, name=name)
def rfftn(x, s=None, axes=None, norm="backward", name=None):
"""
The N dimensional FFT for real input.
This function computes the N-dimensional discrete Fourier Transform over
......@@ -665,10 +668,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None):
refer to :ref:`api_guide_Name` .
Returns:
out(Tensor): complex tensor
out(Tensor), complex tensor
Examples:
.. code-block:: python
import paddle
......@@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return fftn(x, s, axes, norm, name)
......@@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return ifftn(x, s, axes, norm, name)
......@@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return rfftn(x, s, axes, norm, name)
......@@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return irfftn(x, s, axes, norm, name)
......@@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return hfftn(x, s, axes, norm, name)
......@@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None:
if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers."
.format(s))
"Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
s
)
)
if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
.format(axes))
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
axes
)
)
return ihfftn(x, s, axes, norm, name)
......@@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
'onesided', onesided)
attrs = (
'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else:
inputs = {
......@@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype))
_real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph():
if n is not None:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
'last_dim_size', n)
attrs = (
'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
n,
)
else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs)
......@@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name):
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype))
_complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name):
if s is not None:
if len(s) != len(axes):
raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.".
format(len(s), len(axes)))
"Length of s ({}) and length of axes ({}) does not match.".format(
len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft]
if s is not None:
......@@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if s is not None:
if len(s) != len(axes):
raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.".
format(len(s), len(axes)))
"Length of s ({}) and length of axes ({}) does not match.".format(
len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None:
......@@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
'onesided', onesided)
attrs = (
'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else:
inputs = {
......@@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype))
_real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
if s is not None:
if len(s) != len(axes):
raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.".
format(len(s), len(axes)))
"Length of s ({}) and length of axes ({}) does not match.".format(
len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None:
......@@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph():
if s:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
'last_dim_size', s[-1])
attrs = (
'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
s[-1],
)
else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs)
......@@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype))
_complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]}
helper.append_op(type=op_type,
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
)
return out
......@@ -23,9 +23,9 @@ from ...log_helper import get_logger
__all__ = ['add_supported_layer']
_logger = get_logger(__name__,
logging.INFO,
fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
)
def _default_pruning(weight_nparray, m, n, func_name, param_name):
......@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
exlude_cond_shape4 = len(shape) == 4 and shape[1] < m
if exlude_cond_shape2:
_logger.warning(
'{} is not pruned because the first dimension of {} is smaller than {}'
.format(param_name, shape, m))
'{} is not pruned because the first dimension of {} is smaller than {}'.format(
param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask
if exlude_cond_shape4:
_logger.warning(
'{} is not pruned because the second dimension of {} is smaller than {}'
.format(param_name, shape, m))
'{} is not pruned because the second dimension of {} is smaller than {}'.format(
param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask
checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
......@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
    # sparsity/utils performs row-major pruning. That is the reason we have to transpose weight
    # matrices before invoking create_mask. Then we transpose the resulting mask to make
    # sure its shape is the same as the input weight.
weight_sparse_mask = sparsity.create_mask(weight_nparray.T,
func_name=func_name,
n=n,
m=m).T
weight_sparse_mask = sparsity.create_mask(
weight_nparray.T, func_name=func_name, n=n, m=m
).T
weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
'Pruning {} weight matrix failure!!!'.format(param_name)
assert sparsity.check_sparsity(
weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name
), 'Pruning {} weight matrix failure!!!'.format(param_name)
return weight_pruned_nparray, weight_sparse_mask
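
A pure-NumPy sketch (not the Paddle API) of the transpose trick described in the comment above: the mask helper prunes along rows, so to get a 2:4 pattern along each column of a weight, the weight is transposed, masked row-wise, and the mask is transposed back.

import numpy as np

def rowwise_2_4_mask(mat):
    # zero the 2 smallest-magnitude entries in every group of 4 within a row
    mask = np.ones_like(mat)
    for i, row in enumerate(mat):
        for start in range(0, row.size, 4):
            drop = np.argsort(np.abs(row[start:start + 4]))[:2]
            mask[i, start + drop] = 0.0
    return mask

w = np.random.rand(8, 4)                    # hypothetical weight matrix
col_mask = rowwise_2_4_mask(w.T).T          # 2:4 sparsity along each column of w
assert (col_mask.sum(axis=0) == w.shape[0] // 2).all()
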
......@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None):
r"""
    Add supported layers and their corresponding pruning functions.
Args:
......@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None):
    pruning_func (function, optional): a function type which receives five arguments (weight_nparray,
    m, n, func_name, param_name); weight_nparray is an nparray of the weight, param_name is the name of the weight,
    and for m, n, and func_name, please see `prune_model` for details.
"""
name = None
if isinstance(layer, str):
name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
type(layer).__name__)
type(layer).__name__
)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__)
layer.__name__
)
else:
assert "The type of layer should be string of Layer, but got {}!".format(
type(layer))
        raise TypeError(
            "The type of layer should be string or Layer, but got {}!".format(
                type(layer)
            )
        )
if pruning_func is None:
pruning_func = _default_pruning
_supported_layers_and_prune_func_map_lock.acquire()
......
......@@ -27,9 +27,16 @@ from itertools import permutations
import threading
__all__ = [
'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity',
'MaskAlgo', 'CheckMethod'
'calculate_density',
'check_mask_1d',
'get_mask_1d',
'check_mask_2d',
'get_mask_2d_greedy',
'get_mask_2d_best',
'create_mask',
'check_sparsity',
'MaskAlgo',
'CheckMethod',
]
......@@ -76,8 +83,9 @@ class CheckMethod(Enum):
CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
# CheckMethod.CHECK_2D
"""
assert isinstance(mask_algo, MaskAlgo), \
"mask_algo should be MaskAlgo type"
assert isinstance(
mask_algo, MaskAlgo
), "mask_algo should be MaskAlgo type"
if mask_algo == MaskAlgo.MASK_1D:
return CheckMethod.CHECK_1D
else:
......@@ -86,20 +94,25 @@ class CheckMethod(Enum):
def calculate_density(x):
r"""
Return the density of the input tensor.
Args:
x (nparray): The input tensor.
Returns:
float: The density of :attr:`x`.
float, The density of :attr:`x`.
Examples:
.. code-block:: python
import paddle
import numpy as np
x = np.array([[0, 1, 3, 0],
[1, 1, 0, 1]])
paddle.incubate.asp.calculate_density(x) # 0.625
"""
x_flattened = x.flatten()
return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
......@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
remainder = mat.shape[1] % m
if mat.shape[1] % m > 0:
mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
mat_padded[:, :mat.shape[1]] = mat
mat_padded[:, : mat.shape[1]] = mat
shape = mat_padded.shape
return mat_padded.reshape(-1, m), shape
else:
......@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
min_order_indices = np.argsort(np.absolute(sub_mat))
mask_flattern[i, min_order_indices[:n].tolist()] = 0
mask_flattern = mask_flattern.reshape(shape)
mask[:, :] = mask_flattern[:, :mat.shape[1]]
mask[:, :] = mask_flattern[:, : mat.shape[1]]
return mask
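
A hedged usage sketch of `get_mask_1d` as defined above (the import path is an assumption; the helpers live in the sparsity utils module shown in this hunk): with n=2, m=4 every group of 4 consecutive entries in a row keeps its 2 largest magnitudes.

import numpy as np
from paddle.fluid.contrib.sparsity import get_mask_1d, check_mask_1d  # assumed import path

mat = np.array([[0.1, 0.2, 0.3, 0.4],
                [4.0, 3.0, 2.0, 1.0]])
mask = get_mask_1d(mat, n=2, m=4)
# expected mask:
# [[0. 0. 1. 1.]
#  [1. 1. 0. 0.]]
assert check_mask_1d(mat * mask, n=2, m=4)
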
......@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
remainder_0 = mat.shape[0] % m
remainder_1 = mat.shape[1] % m
new_shape = (mat.shape[0] if remainder_0 == 0 \
else mat.shape[0] + (m - remainder_0),
mat.shape[1] if remainder_1 == 0 \
else mat.shape[1] + (m - remainder_1))
new_shape = (
mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0),
mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1),
)
mat_padded = np.zeros(new_shape)
mat_padded[:mat.shape[0], :mat.shape[1]] = mat
mat_padded[: mat.shape[0], : mat.shape[1]] = mat
mat_flattern = np.empty(new_shape).reshape(-1, m * m)
curr_idx = 0
......@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
row_end = row_start + m
for col_start in range(0, mat_padded.shape[1], m):
col_end = col_start + m
sub_mat = np.squeeze(mat_padded[row_start:row_end, \
col_start:col_end] \
.reshape(-1))
sub_mat = np.squeeze(
mat_padded[row_start:row_end, col_start:col_end].reshape(-1)
)
mat_flattern[curr_idx] = sub_mat
curr_idx += 1
return mat_flattern, mat_padded.shape
......@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m):
mat_padded, shape = _reshape_2d(mat, m)
for sub_mat in mat_padded:
sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \
(np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0):
if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and (
np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0
):
return False
return True
......@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
sub_mask = np.squeeze(mask_padded[idx])
min_order_1d_indices = np.argsort(sub_mat)
min_order_2d_indices = [(int(x / m), x % m)
for x in min_order_1d_indices]
min_order_2d_indices = [
(int(x / m), x % m) for x in min_order_1d_indices
]
row_counter = collections.Counter()
col_counter = collections.Counter()
for i in range(len(min_order_1d_indices) - 1, -1, -1):
matrix_entry = min_order_2d_indices[i]
if (row_counter[matrix_entry[0]] == n) or \
(col_counter[matrix_entry[1]] == n):
if (row_counter[matrix_entry[0]] == n) or (
col_counter[matrix_entry[1]] == n
):
continue
sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
......@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]]
return mask[: mat.shape[0], : mat.shape[1]]
_valid_2d_patterns_lock = threading.Lock()
......@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
patterns = patterns + patterns
patterns = np.asarray(list(set(permutations(patterns, m))))
valid = ((patterns.sum(axis=1) <= n).sum(
axis=1) == m).nonzero()[0].reshape(-1)
valid = (
((patterns.sum(axis=1) <= n).sum(axis=1) == m)
.nonzero()[0]
.reshape(-1)
)
valid_patterns = np.empty((valid.shape[0], m, m))
valid_patterns[:] = patterns[valid[:]]
......@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
mat_flattern, shape = _reshape_2d(mat, m)
mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
pmax = np.argmax(np.matmul(mat_flattern,
patterns.reshape(patterns.shape[0], m * m).T),
axis=1)
pmax = np.argmax(
np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
axis=1,
)
mask_flattern[:] = patterns[pmax[:]]
mask = np.empty(shape)
......@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]]
return mask[: mat.shape[0], : mat.shape[1]]
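
A hedged comparison sketch (same assumed import path as the previous snippet): the "best" mask enumerates every valid 2:4 pattern per m x m block and keeps the pattern whose kept entries have the largest total value, while the greedy mask fills rows and columns heuristically; both satisfy check_mask_2d.

import numpy as np
from paddle.fluid.contrib.sparsity import (   # assumed import path
    get_mask_2d_best, get_mask_2d_greedy, check_mask_2d,
)

mat = np.random.rand(8, 8)
best = get_mask_2d_best(mat, n=2, m=4)
greedy = get_mask_2d_greedy(mat, n=2, m=4)
assert check_mask_2d(mat * best, n=2, m=4)
assert check_mask_2d(mat * greedy, n=2, m=4)
# the "best" mask typically preserves at least as much total weight magnitude
print((np.abs(mat) * best).sum(), (np.abs(mat) * greedy).sum())
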
def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
......@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
dtype = tensor.dtype
t = tensor.astype(float)
assert isinstance(func_name, MaskAlgo), \
"func_name argumet of create_mask is only accepted as type MaskAlgo. " \
assert isinstance(func_name, MaskAlgo), (
"func_name argumet of create_mask is only accepted as type MaskAlgo. "
"But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1:
t = t.reshape(1, shape[0])
......@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4:
t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3],
shape[2])
t = t.transpose([0, 1, 3, 2]).reshape(
shape[0] * shape[1] * shape[3], shape[2]
)
mask = func(t, n=n, m=m)
return mask.reshape([shape[0], shape[1], shape[3],
shape[2]]).transpose([0, 1, 3, 2]).astype(dtype)
return (
mask.reshape([shape[0], shape[1], shape[3], shape[2]])
.transpose([0, 1, 3, 2])
.astype(dtype)
)
else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \
"Only dimension < 4 is supported but got {}".format(len(shape)))
raise ValueError(
"The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
mask = func(t, n=n, m=m)
return mask.reshape(shape).astype(dtype)
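
A hedged end-to-end sketch of create_mask/check_sparsity (import path assumed; both names appear in the __all__ list above): a 4-D conv weight stored as (h, w, in, out) is internally reshaped to (h*w*out, in), so the 2:4 pattern is enforced along the input-channel dimension.

import numpy as np
from paddle.fluid.contrib.sparsity import (   # assumed import path
    create_mask, check_sparsity, MaskAlgo, CheckMethod,
)

conv_w = np.random.rand(3, 3, 8, 16)          # hypothetical (h, w, in, out) weight
mask = create_mask(conv_w, func_name=MaskAlgo.MASK_1D, n=2, m=4)
assert mask.shape == conv_w.shape
assert check_sparsity(conv_w * mask, func_name=CheckMethod.CHECK_1D, n=2, m=4)
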
......@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
shape = tensor.shape
t = tensor.astype(float)
assert type(func_name) == CheckMethod, \
"func_name argumet of check_sparsity is only accepted as type CheckMethod. " \
assert type(func_name) == CheckMethod, (
"func_name argumet of check_sparsity is only accepted as type CheckMethod. "
"But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1:
t = t.reshape(1, shape[0])
......@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4:
t = t.transpose([0, 1, 3,
2]).reshape([shape[0] * shape[1] * shape[3], shape[2]])
t = t.transpose([0, 1, 3, 2]).reshape(
[shape[0] * shape[1] * shape[3], shape[2]]
)
else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \
"Only dimension < 4 is supported but got {}".format(len(shape)))
raise ValueError(
"The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
return func(t, n=n, m=m)
......@@ -32,12 +32,25 @@ from . import parallel_helper
from .. import unique_name
from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper
from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder
from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable
from .layer_hooks import (
record_program_ops_pre_hook,
set_op_customized_attrs_post_hook,
LayerOpsRecoder,
)
from .base import (
program_desc_tracing_guard,
param_guard,
in_declarative_mode,
_convert_into_variable,
)
from paddle.fluid import framework
from ..param_attr import ParamAttr
from paddle.fluid.executor import Executor, global_scope
from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode
from paddle.fluid.framework import (
_non_static_mode,
convert_np_dtype_to_dtype_,
in_dygraph_mode,
)
from paddle.fluid.framework import Program, program_guard
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.core import VarDesc
......@@ -67,7 +80,7 @@ def _addindent(string, indent):
class HookRemoveHelper(object):
""" A HookRemoveHelper that can be used to remove hook. """
"""A HookRemoveHelper that can be used to remove hook."""
next_hook_id = 0
......@@ -153,13 +166,14 @@ class Layer(object):
def train(self):
"""
Sets this Layer and all its sublayers to training mode.
        This only affects certain modules like `Dropout` and `BatchNorm`.
Returns:
None
Example::
Examples:
.. code-block:: python
import paddle
......@@ -236,6 +250,7 @@ class Layer(object):
def apply(self, fn):
"""
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model.
......@@ -243,7 +258,7 @@ class Layer(object):
fn (function): a function to be applied to each sublayer
Returns:
Layer: self
Layer, self
Example::
.. code-block:: python
......@@ -263,6 +278,7 @@ class Layer(object):
net.apply(init_weights)
print(net.state_dict())
"""
for layer in self.children():
layer.apply(fn)
......@@ -272,10 +288,12 @@ class Layer(object):
return self
def full_name(self):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
"""
Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
Returns:
str: full name of this layer.
str, full name of this layer.
Example::
.. code-block:: python
......@@ -297,7 +315,9 @@ class Layer(object):
return self._full_name
def register_forward_post_hook(self, hook):
"""Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
"""
Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
        It should have the following form: the `input` and `output` of the `hook` are the `input` and `output` of the `Layer` respectively.
        Users can use a forward post-hook to change the output of the Layer or to perform information statistics tasks on the Layer.
......@@ -308,7 +328,7 @@ class Layer(object):
hook(function): a function registered as a forward post-hook
Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples:
.. code-block:: python
......@@ -340,13 +360,16 @@ class Layer(object):
# hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
assert (out0.numpy() == (out1.numpy()) * 2).any()
"""
hook_remove_helper = HookRemoveHelper(self._forward_post_hooks)
self._forward_post_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
def register_forward_pre_hook(self, hook):
"""Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
"""
Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
It should have the following form, `input` of the `hook` is `input` of the `Layer`,
hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if
......@@ -359,7 +382,7 @@ class Layer(object):
hook(function): a function registered as a forward pre-hook
Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples:
.. code-block:: python
......@@ -398,12 +421,14 @@ class Layer(object):
self._forward_pre_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
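
A hedged sketch of a forward pre-hook (the Examples block of this docstring is elided in the hunk): the hook receives `(layer, input)`, where `input` is the tuple of positional arguments, and a returned value replaces that input (a single value is wrapped back into a tuple).

import paddle

def double_input_hook(layer, input):
    return input[0] * 2                      # forward will now run on x * 2

linear = paddle.nn.Linear(4, 4)
handle = linear.register_forward_pre_hook(double_input_hook)
x = paddle.ones([2, 4])
out_hooked = linear(x)
handle.remove()                              # HookRemoveHelper detaches the hook
out_plain = linear(x)
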
def create_parameter(self,
def create_parameter(
self,
shape,
attr=None,
dtype=None,
is_bias=False,
default_initializer=None):
default_initializer=None,
):
"""Create parameters for this layer.
Parameters:
......@@ -443,12 +468,15 @@ class Layer(object):
temp_attr = copy.deepcopy(attr)
if isinstance(temp_attr, six.string_types) and temp_attr == "":
temp_attr = None
return self._helper.create_parameter(temp_attr, shape, dtype, is_bias,
default_initializer)
return self._helper.create_parameter(
temp_attr, shape, dtype, is_bias, default_initializer
)
@deprecated(since="2.0.0",
@deprecated(
since="2.0.0",
update_to="paddle.nn.Layer.create_tensor",
reason="New api in create_tensor, easier to use.")
reason="New api in create_tensor, easier to use.",
)
def create_variable(self, name=None, persistable=None, dtype=None):
"""
......@@ -488,14 +516,16 @@ class Layer(object):
if name is not None:
var_name = ".".join([self._full_name, name])
else:
var_name = unique_name.generate(".".join(
[self._full_name, "_generated_var"]))
var_name = unique_name.generate(
".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var(
name=var_name,
persistable=persistable,
dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR)
type=core.VarDesc.VarType.LOD_TENSOR,
)
# TODO: Add more parameter list when we need them
def create_tensor(self, name=None, persistable=None, dtype=None):
......@@ -538,20 +568,24 @@ class Layer(object):
if name is not None:
var_name = ".".join([self._full_name, name])
else:
var_name = unique_name.generate(".".join(
[self._full_name, "_generated_var"]))
var_name = unique_name.generate(
".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var(
name=var_name,
persistable=persistable,
dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR)
type=core.VarDesc.VarType.LOD_TENSOR,
)
def parameters(self, include_sublayers=True):
"""Returns a list of all Parameters from current layer and its sub-layers.
"""
Returns a list of all Parameters from current layer and its sub-layers.
Returns:
list of Tensor : a list of Parameters.
list of Tensor, a list of Parameters.
Examples:
.. code-block:: python
......@@ -563,13 +597,17 @@ class Layer(object):
"""
ret = [
param for _, param in self.named_parameters(
include_sublayers=include_sublayers)
param
for _, param in self.named_parameters(
include_sublayers=include_sublayers
)
]
return ret
def children(self):
"""Returns an iterator over immediate children layers.
"""
Returns an iterator over immediate children layers.
Yields:
Layer: a child layer
......@@ -619,13 +657,15 @@ class Layer(object):
yield name, layer
def sublayers(self, include_self=False):
"""Returns a list of sub layers.
"""
Returns a list of sub layers.
Parameters:
include_self(bool, optional): Whether return self as sublayers. Default: False
Returns:
list of Layer : a list of sub layers.
list of Layer, a list of sub layers.
Examples:
.. code-block:: python
......@@ -678,9 +718,11 @@ class Layer(object):
"""
params_set = set()
named_sublayers = self.named_sublayers(
prefix=prefix, include_self=True) if include_sublayers else zip(
[prefix], [self])
named_sublayers = (
self.named_sublayers(prefix=prefix, include_self=True)
if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers:
params = sublayer._parameters.items()
for key, param in params:
......@@ -724,9 +766,9 @@ class Layer(object):
if layer is None:
continue
layer_prefix = prefix + ('.' if prefix else '') + key
for p, l in layer.named_sublayers(prefix=layer_prefix,
include_self=True,
layers_set=layers_set):
for p, l in layer.named_sublayers(
prefix=layer_prefix, include_self=True, layers_set=layers_set
):
yield p, l
def register_buffer(self, name, tensor, persistable=True):
......@@ -769,25 +811,32 @@ class Layer(object):
if '_buffers' not in self.__dict__:
raise ValueError(
"super(YourLayer, self).__init__() should be called first")
"super(YourLayer, self).__init__() should be called first"
)
elif not isinstance(name, six.string_types):
raise TypeError(
"The name of buffer should be a string, but received {}.".
format(type(name).__name__))
"The name of buffer should be a string, but received {}.".format(
type(name).__name__
)
)
elif '.' in name:
raise KeyError(
"The name of buffer can not contain `.`, "
"because when you access the newly added buffer in the "
"form of `self.**.**`, it will cause AttributeError.")
"form of `self.**.**`, it will cause AttributeError."
)
elif name == '':
raise KeyError("The name of buffer can not be empty.")
elif hasattr(self, name) and name not in self._buffers:
raise KeyError("attribute '{}' already exists.".format(name))
elif tensor is not None and not (type(tensor) == core.VarBase
or type(tensor) == core.eager.Tensor):
elif tensor is not None and not (
type(tensor) == core.VarBase or type(tensor) == core.eager.Tensor
):
raise TypeError(
"The registered buffer should be a Paddle.Tensor, but received {}."
.format(type(tensor).__name__))
"The registered buffer should be a Paddle.Tensor, but received {}.".format(
type(tensor).__name__
)
)
else:
self._buffers[name] = tensor
if persistable:
......@@ -797,13 +846,14 @@ class Layer(object):
def buffers(self, include_sublayers=True):
"""
Returns a list of all buffers from current layer and its sub-layers.
Parameters:
include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
Returns:
list of Tensor : a list of buffers.
list of Tensor, a list of buffers.
Examples:
.. code-block:: python
......@@ -820,8 +870,10 @@ class Layer(object):
"""
ret = [
buffer for _, buffer in self.named_buffers(
include_sublayers=include_sublayers)
buffer
for _, buffer in self.named_buffers(
include_sublayers=include_sublayers
)
]
return ret
......@@ -862,9 +914,11 @@ class Layer(object):
"""
buffers_set = set()
named_sublayers = self.named_sublayers(
prefix=prefix, include_self=True) if include_sublayers else zip(
[prefix], [self])
named_sublayers = (
self.named_sublayers(prefix=prefix, include_self=True)
if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers:
buffers = sublayer._buffers.items()
for key, buffer in buffers:
......@@ -910,7 +964,7 @@ class Layer(object):
hook_result = forward_pre_hook(self, inputs)
if hook_result is not None:
if not isinstance(hook_result, tuple):
hook_result = (hook_result, )
hook_result = (hook_result,)
inputs = hook_result
if not self._built:
......@@ -920,16 +974,20 @@ class Layer(object):
# TODO(liuyuhui) Only xpu broadcast parameters here.
# The other device is to call _sync_params_buffers in DataParallel
# to realize the parameter synchronization among multiply cards.
if parallel_helper._is_data_parallel_mode(
) and paddle.is_compiled_with_xpu():
if (
parallel_helper._is_data_parallel_mode()
and paddle.is_compiled_with_xpu()
):
parallel_helper._broadcast_parameters(
self._parameters.values())
self._parameters.values()
)
self._built = True
if in_profiler_mode():
with profiler.RecordEvent(self.__class__.__name__,
profiler.TracerEventType.Forward):
with profiler.RecordEvent(
self.__class__.__name__, profiler.TracerEventType.Forward
):
outputs = self.forward(*inputs, **kwargs)
else:
outputs = self.forward(*inputs, **kwargs)
......@@ -942,8 +1000,14 @@ class Layer(object):
return outputs
def __call__(self, *inputs, **kwargs):
if (not in_declarative_mode()) and (not self._forward_pre_hooks) \
and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()):
if (
(not in_declarative_mode())
and (not self._forward_pre_hooks)
and (not self._forward_post_hooks)
and (not self._built)
and in_dygraph_mode()
and (not in_profiler_mode())
):
self._build_once(*inputs, **kwargs)
return self.forward(*inputs, **kwargs)
else:
......@@ -964,7 +1028,9 @@ class Layer(object):
raise ValueError("Layer shouldn't implement backward")
def add_sublayer(self, name, sublayer):
"""Adds a sub Layer instance.
"""
Adds a sub Layer instance.
Added sublayer can be accessed by self.name
......@@ -972,7 +1038,7 @@ class Layer(object):
name(str): name of this sublayer.
sublayer(Layer): an instance of Layer.
Returns:
Layer: the sublayer passed in.
Layer, the sublayer passed in.
Examples:
.. code-block:: python
......@@ -999,8 +1065,9 @@ class Layer(object):
model = MySequential(fc1, fc2)
for prefix, layer in model.named_sublayers():
print(prefix, layer)
"""
assert (isinstance(sublayer, Layer) or sublayer == None)
assert isinstance(sublayer, Layer) or sublayer == None
self._sub_layers[name] = sublayer
return sublayer
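
A hedged sketch of add_sublayer: the sublayer is registered under the given name, shows up in named_sublayers(), and is also reachable as an attribute of the parent layer.

import paddle

class Block(paddle.nn.Layer):
    def __init__(self):
        super(Block, self).__init__()
        self.add_sublayer("fc", paddle.nn.Linear(4, 4))

    def forward(self, x):
        return self.fc(x)                    # the registered sublayer is self.fc

block = Block()
print([name for name, _ in block.named_sublayers()])   # ['fc']
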
......@@ -1014,7 +1081,7 @@ class Layer(object):
name(str): name of this sublayer.
parameter(Parameter): an instance of Parameter.
Returns:
Parameter: the parameter passed in.
Parameter, the parameter passed in.
Examples:
.. code-block:: python
......@@ -1037,32 +1104,42 @@ class Layer(object):
"""
if '_parameters' not in self.__dict__:
raise RuntimeError(
"super(YourLayer, self).__init__() should be called firstly.")
"super(YourLayer, self).__init__() should be called firstly."
)
elif not isinstance(name, six.string_types):
raise TypeError(
"The name of parameter should be a string, but received {}.".
format(type(name).__name__))
"The name of parameter should be a string, but received {}.".format(
type(name).__name__
)
)
elif '.' in name:
raise KeyError(
"The name of parameter can not contain `.`, "
"because when you access the newly added parameter in the "
"form of `self.**.**`, it will cause AttributeError.")
"form of `self.**.**`, it will cause AttributeError."
)
elif name == '':
raise KeyError("The name of parameter can not be empty.")
elif hasattr(self, name) and name not in self._parameters:
raise KeyError("The parameter '{}' already exists.".format(name))
elif parameter is not None and not isinstance(parameter,
framework.Parameter):
elif parameter is not None and not isinstance(
parameter, framework.Parameter
):
raise TypeError(
"The parameter to be added should be a Parameter, but received {}."
.format(type(parameter).__name__))
"The parameter to be added should be a Parameter, but received {}.".format(
type(parameter).__name__
)
)
else:
if parameter is None:
self._parameters[name] = None
if len(self._loaddict_holder) > 0:
assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format(
parameter.name)
assert (
parameter.name in self._loaddict_holder
), "Parameter not found, Can't not find [ {} ] in state_dict".format(
parameter.name
)
parameter.set_value(self._loaddict_holder[parameter.name])
......@@ -1081,37 +1158,50 @@ class Layer(object):
"""
def is_already_registered(is_pre_hook):
layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks
candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook
layers_hooks = (
self._forward_pre_hooks
if is_pre_hook
else self._forward_post_hooks
)
candidate_hook = (
record_program_ops_pre_hook
if is_pre_hook
else set_op_customized_attrs_post_hook
)
already_registed = False
if layers_hooks:
last_key = next(reversed(layers_hooks))
already_registed = (layers_hooks[last_key] == candidate_hook)
already_registed = layers_hooks[last_key] == candidate_hook
return already_registed
if not isinstance(attrs, dict):
raise TypeError(
"attrs should be type(dict), but received {}".format(
type(attrs).__name__))
type(attrs).__name__
)
)
# NOTE: Overwrite behavior for same key.
self._customized_attrs.update(attrs)
if not is_already_registered(is_pre_hook=True):
pre_hook_helper = self.register_forward_pre_hook(
record_program_ops_pre_hook)
record_program_ops_pre_hook
)
assert len(self._op_recorder.hooks) == 0
self._op_recorder.hooks = [pre_hook_helper]
# manually register post_hook to ensure it is inserted into the head.
if not is_already_registered(is_pre_hook=False):
post_hook_helper = self.register_forward_post_hook(
set_op_customized_attrs_post_hook)
set_op_customized_attrs_post_hook
)
if len(self._forward_post_hooks) > 1:
self._forward_post_hooks.move_to_end(post_hook_helper._hook_id,
last=False)
self._forward_post_hooks.move_to_end(
post_hook_helper._hook_id, last=False
)
assert len(self._op_recorder.hooks) == 1
......@@ -1144,7 +1234,6 @@ class Layer(object):
return object.__getattribute__(self, name)
def __setattr__(self, name, value):
def _remove_if_exist(*dicts):
for d in dicts:
if name in d:
......@@ -1156,10 +1245,14 @@ class Layer(object):
if isinstance(value, framework.Parameter):
if params is None:
raise ValueError(
"super(YourLayer, self).__init__() should be called first")
"super(YourLayer, self).__init__() should be called first"
)
if len(self._loaddict_holder) > 0:
assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format(
value.name)
assert (
value.name in self._loaddict_holder
), "Parameter not found, Can't not find [ {} ] in state_dict".format(
value.name
)
value.set_value(self._loaddict_holder[value.name])
......@@ -1168,9 +1261,10 @@ class Layer(object):
elif params is not None and name in params:
if value is not None:
raise TypeError(
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'"
.format(name,
type(value).__name__))
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'".format(
name, type(value).__name__
)
)
params[name] = None
else:
layers = self.__dict__.get('_sub_layers', None)
......@@ -1185,9 +1279,10 @@ class Layer(object):
elif layers is not None and name in layers:
if value is not None:
raise TypeError(
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'"
.format(name,
type(value).__name__))
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'".format(
name, type(value).__name__
)
)
layers[name] = None
else:
_buffers = self.__dict__.get('_buffers', None)
......@@ -1196,8 +1291,9 @@ class Layer(object):
raise ValueError(
"super(YourLayer, self).__init__() should be called first"
)
_remove_if_exist(self.__dict__, self._parameters,
self._sub_layers)
_remove_if_exist(
self.__dict__, self._parameters, self._sub_layers
)
# Set persistable=False by default. Only `register_buffer` can
# add a persistable buffer.
if name not in self._buffers:
......@@ -1211,6 +1307,7 @@ class Layer(object):
# value via `assign`.
if type(value) == framework.Variable:
from paddle import assign
# Note(zhhsplendid): the condition below happens in PaddleGan model,
                    # but should all non-Variable _buffers[name] be re-assigned? We
                    # should consider it in the future. I currently wrote this as
......@@ -1218,18 +1315,23 @@ class Layer(object):
if in_declarative_mode() and _buffers[name] is None:
raise RuntimeError(
'In Dy2stat, self.{0} is a buffer and self.{0} is '
'not allowed to be set to Variable when self.{0} is None.'
.format(name))
elif _buffers[name] is None or type(getattr(
self, name)) == core.VarBase:
'not allowed to be set to Variable when self.{0} is None.'.format(
name
)
)
elif (
_buffers[name] is None
or type(getattr(self, name)) == core.VarBase
):
_buffers[name] = assign(value)
else:
assign(value, getattr(self, name))
elif value is not None:
raise TypeError(
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'"
.format(name,
type(value).__name__))
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'".format(
name, type(value).__name__
)
)
else:
# Assigning None will remove the buffer, but if re-assign a new varBase to it,
# it will be remarked as a buffer with same `persistable` attribute.
......@@ -1316,10 +1418,12 @@ class Layer(object):
self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
def _obtain_parameters_buffers(self,
def _obtain_parameters_buffers(
self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
structured_name_prefix="",
):
"""
The difference from state_dict() is that state_dict_hook will not be called,
but the original types of parameters and buffers will be maintained.
......@@ -1330,7 +1434,10 @@ class Layer(object):
if data is not None:
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if buffer is not None and name not in self._non_persistable_buffer_names_set:
if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer
if include_sublayers:
......@@ -1339,17 +1446,22 @@ class Layer(object):
destination_temp = destination.copy()
destination_temp.update(
layer_item._obtain_parameters_buffers(
destination_temp, include_sublayers,
structured_name_prefix + layer_name + "."))
destination_temp,
include_sublayers,
structured_name_prefix + layer_name + ".",
)
)
destination = destination_temp
return destination
def _state_dict_impl(self,
def _state_dict_impl(
self,
destination=None,
include_sublayers=True,
structured_name_prefix="",
include_non_persistable_buffer=False,
use_hook=True):
use_hook=True,
):
"""
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
......@@ -1367,7 +1479,10 @@ class Layer(object):
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set:
if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer
else:
if buffer is not None:
......@@ -1379,9 +1494,13 @@ class Layer(object):
destination_temp = destination.copy()
destination_temp.update(
layer_item._state_dict_impl(
destination_temp, include_sublayers,
destination_temp,
include_sublayers,
structured_name_prefix + layer_name + ".",
include_non_persistable_buffer, use_hook))
include_non_persistable_buffer,
use_hook,
)
)
destination = destination_temp
if use_hook:
for state_dict_hook in self._state_dict_hooks.values():
......@@ -1391,12 +1510,15 @@ class Layer(object):
return destination
def to_static_state_dict(self,
def to_static_state_dict(
self,
destination=None,
include_sublayers=True,
structured_name_prefix="",
use_hook=True):
use_hook=True,
):
'''
Get all parameters and buffers of current layer and its sub-layers. And set them into a dict
Parameters:
......@@ -1405,7 +1527,7 @@ class Layer(object):
use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True
Retruns:
dict: a dict contains all the parameters and persistable buffers.
dict, a dict contains all the parameters and persistable buffers.
Examples:
.. code-block:: python
......@@ -1423,13 +1545,16 @@ class Layer(object):
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True,
use_hook=use_hook)
use_hook=use_hook,
)
def state_dict(self,
def state_dict(
self,
destination=None,
include_sublayers=True,
structured_name_prefix="",
use_hook=True):
use_hook=True,
):
'''
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
......@@ -1457,7 +1582,8 @@ class Layer(object):
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False,
use_hook=use_hook)
use_hook=use_hook,
)
@framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True):
......@@ -1489,22 +1615,31 @@ class Layer(object):
state = state_dict.get(key, None)
if state is None:
raise ValueError(
"{} is not found in the provided dict.".format(key))
if (isinstance(state, dict) or isinstance(state, list)):
if (len(state) != len(param)):
raise ValueError("{} receieves the length of {}, "
"{} is not found in the provided dict.".format(key)
)
if isinstance(state, dict) or isinstance(state, list):
if len(state) != len(param):
raise ValueError(
"{} receieves the length of {}, "
"but the expected shape is {}".format(
key, len(state), len(param)))
key, len(state), len(param)
)
)
else:
return param, state
else:
state_shape = state.shape() if inspect.ismethod(
state.shape) else state.shape
state_shape = (
state.shape()
if inspect.ismethod(state.shape)
else state.shape
)
if list(state_shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state_shape), list(param.shape)))
"{} receives a shape {}, but the expected shape is {}.".format(
key, list(state_shape), list(param.shape)
)
)
return param, state
matched_param_state = []
......@@ -1541,8 +1676,10 @@ class Layer(object):
executor = Executor(_get_device())._default_executor
# restore parameter states
core._create_loaded_parameter(
[param for param, state in matched_param_state], global_scope(),
executor)
[param for param, state in matched_param_state],
global_scope(),
executor,
)
for param, state in matched_param_state:
_set_var(param, state)
......@@ -1594,11 +1731,13 @@ class Layer(object):
# [ 0.33960250, 0.96878713]])
'''
return self._to_impl(device=device,
return self._to_impl(
device=device,
dtype=dtype,
blocking=blocking,
include_sublayers=True,
floating_only=False)
floating_only=False,
)
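
A hedged usage sketch of the `to` wrapper above: it casts and/or moves every parameter and buffer in place; `blocking` defaults to True when left as None.

import paddle

linear = paddle.nn.Linear(2, 2)
linear.to(dtype='float64')        # cast parameters and buffers to float64
linear.to(device='cpu')           # move them to CPU memory
print(linear.weight.dtype, linear.weight.place)
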
def _apply(self, func, device, dtype, blocking, include_sublayers=True):
if include_sublayers:
......@@ -1612,8 +1751,9 @@ class Layer(object):
if param.grad is not None:
with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype,
blocking)
grad_applied = func(
param._grad_ivar(), device, dtype, blocking
)
for key, buf in self._buffers.items():
if buf is not None:
......@@ -1637,12 +1777,14 @@ class Layer(object):
            # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes; waiting_alloc_memory computes the memory space occupied by 't'.
            # The coefficient 1.2 is used to avoid the OOM that may occur in this critical state when the memory is only just enough.
waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
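                # e.g. a float32 tensor of shape (1024, 1024) occupies
                # 1024 * 1024 * 4 = 4194304 bytes; padding to the next 256-byte
                # unit gives 4194560 bytes, and the 1.2 factor raises the
                # estimate to about 5033472 bytes (~4.8 MiB).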
gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(),
blocking) # k-v type will error
t_used = t._copy_to(
paddle.CPUPlace(), blocking
) # k-v type will error
# Release mem of t
t.value().get_tensor()._clear()
else:
......@@ -1653,7 +1795,8 @@ class Layer(object):
# 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype:
with paddle.fluid.framework._dygraph_place_guard(
place=t_used.place):
place=t_used.place
):
t_casted = t_used.cast(dtype=dtype)
else:
t_casted = t_used
......@@ -1671,12 +1814,14 @@ class Layer(object):
return t
def _to_impl(self,
def _to_impl(
self,
device=None,
dtype=None,
blocking=None,
include_sublayers=True,
floating_only=False):
floating_only=False,
):
'''
        Cast the parameters and buffers of Layer by the given device, dtype and blocking.
......@@ -1705,20 +1850,28 @@ class Layer(object):
if device is not None:
if isinstance(device, str):
device = paddle.device._convert_to_place(device)
elif isinstance(device, (core.CPUPlace, core.CUDAPlace,
core.CUDAPinnedPlace, core.XPUPlace)):
elif isinstance(
device,
(
core.CPUPlace,
core.CUDAPlace,
core.CUDAPinnedPlace,
core.XPUPlace,
),
):
pass
else:
raise ValueError(
"device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is "
+ type(device).__name__)
+ type(device).__name__
)
if blocking is None:
blocking = True
else:
assert isinstance(
blocking,
bool), "blocking value error, must be the True, False or None"
blocking, bool
), "blocking value error, must be the True, False or None"
def transform(t, device, dtype, blocking):
if floating_only and (not paddle.is_floating_point(t)):
......
......@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
@six.add_metaclass(VariableMetaClass)
class Variable(object):
"""
**Notes**:
**The constructor of Variable should not be invoked directly.**
**In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.**
Notes:
The constructor of Variable should not be invoked directly.
        In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being fed.
**In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data**
In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data.
In Fluid, every input and output of an OP is a variable. In most
cases, variables are used for holding different kinds of data or training
......@@ -1514,12 +1515,13 @@ class Variable(object):
def detach(self):
"""
Returns a new Variable, detached from the current graph.
It will share data with origin Variable and without tensor copy.
In addition, the detached Variable doesn't provide gradient propagation.
Returns:
( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable.
( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable.
Examples:
.. code-block:: python
......@@ -1533,6 +1535,7 @@ class Variable(object):
# create a detached Variable
y = x.detach()
"""
assert (
......@@ -2085,6 +2088,7 @@ class Variable(object):
@property
def T(self):
"""
Permute current Variable with its dimensions reversed.
If `n` is the number of dimensions of `x`, `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
......@@ -2103,6 +2107,7 @@ class Variable(object):
x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
print(x_T_np.shape)
# (5, 3, 2)
"""
if len(self.shape) == 1:
return self
......@@ -2141,7 +2146,7 @@ class Variable(object):
as ``out = assign(tensor)`` .
Returns:
Variable: The cloned Variable.
Variable, The cloned Variable.
Examples:
.. code-block:: python
......@@ -2171,6 +2176,7 @@ class Variable(object):
def _set_error_clip(self, error_clip):
"""
Set the error_clip.
Args:
......@@ -2178,11 +2184,13 @@ class Variable(object):
Returns:
None
"""
self.error_clip = error_clip
def _set_info(self, key, value):
"""
Set key-value information for this variable.
Args:
......@@ -2191,6 +2199,7 @@ class Variable(object):
Returns:
None
"""
if not hasattr(self, "_info"):
self._info = {}
......@@ -2198,6 +2207,7 @@ class Variable(object):
def _get_info(self, key):
"""
Get the information of this variable corresponding to key.
Args:
......@@ -2205,6 +2215,7 @@ class Variable(object):
Returns:
object
"""
if hasattr(self, "_info") and key in self._info:
return self._info[key]
......@@ -2212,7 +2223,9 @@ class Variable(object):
def _slice_indices(self, slice, length):
"""
Reference implementation for the slice.indices method.
"""
# Compute step and length as integers.
step = 1 if slice.step is None else slice.step
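`_slice_indices` mirrors Python's built-in `slice.indices`; for reference, the built-in behaves like this:

.. code-block:: python

    # slice.indices(length) fills in defaults and clamps the bounds,
    # returning (start, stop, step) suitable for range().
    s = slice(None, None, 2)
    print(s.indices(5))                 # (0, 5, 2)
    print(list(range(*s.indices(5))))   # [0, 2, 4]

    s = slice(-3, None, 1)
    print(s.indices(5))                 # (2, 5, 1)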
......@@ -2383,7 +2396,7 @@ class Variable(object):
Default: None
Returns:
Tensor: the value in given scope.
Tensor, the value in given scope.
Examples:
.. code-block:: python
......@@ -2438,6 +2451,7 @@ class Variable(object):
def set_value(self, value, scope=None):
'''
Set the value to the tensor in given scope.
Args:
......@@ -2477,6 +2491,7 @@ class Variable(object):
if var.persistable:
t_load = paddle.load(path+var.name+'.pdtensor')
var.set_value(t_load)
'''
# The 'framework' is a low-level module, and 'executor'
......@@ -2547,10 +2562,11 @@ class Variable(object):
def size(self):
"""
Returns the number of elements for the current Variable, which is an int64 Variable with shape [1].
Returns:
Variable: the number of elements for current Variable
Variable, the number of elements for current Variable
Examples:
.. code-block:: python
......@@ -2564,6 +2580,7 @@ class Variable(object):
# get the number of elements of the Variable
y = x.size()
"""
output = self.block.create_var(
......@@ -2578,23 +2595,27 @@ class Variable(object):
def _set_attr(self, name, val):
"""
Set the value of attribute by attribute's name.
Args:
name(str): the attribute name.
val(int|str|list): the value of the attribute.
"""
self._update_desc_attr(name, val)
def _has_attr(self, name):
"""
Whether this Variable has the attribute with the name `name` or not.
Args:
name(str): the attribute name.
Returns:
bool: True if has this attribute.
bool, True if has this attribute.
"""
return self.desc.has_attr(name)
......@@ -2624,7 +2645,7 @@ class Variable(object):
name(str): the attribute name.
Returns:
int|str|list: The attribute value. The return value
int|str|list, The attribute value. The return value
can be any valid attribute type.
"""
return self.desc.attr(name)
......@@ -3196,14 +3217,16 @@ class Operator(object):
def input(self, name):
r"""
Get the input arguments according to the input parameter name.
Args:
name(str): The input parameter name.
Returns:
list: return the list of argument names that associated with \
list, return the list of argument names that are associated with \
the specific parameter name.
"""
return self.desc.input(name)
......
......@@ -20,7 +20,13 @@ from __future__ import print_function
import warnings
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode
from ..framework import (
Variable,
_non_static_mode,
_varbase_creator,
_in_legacy_dygraph,
in_dygraph_mode,
)
from .. import core
from ..param_attr import ParamAttr
from . import nn
......@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None):
"""
accuracy layer.
Refer to https://en.wikipedia.org/wiki/Precision_and_recall
This function computes the accuracy using the input and label.
If the correct label occurs in top k predictions, then correct will increment by one.
Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
Note:
The dtype of accuracy is determined by input. The input and label dtype can be different.
Args:
input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
The shape is ``[sample_number, class_dim]`` .
label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
k(int): The top k predictions for each class will be checked. Data type is int64 or int32.
correct(Tensor): The correct predictions count. A Tensor with type int64 or int32.
total(Tensor): The total entries count. A tensor with type int64 or int32.
k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.
Returns:
Tensor: The correct rate. A Tensor with type float32.
Tensor, The correct rate. A Tensor with type float32.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.static as static
......@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
fetch_list=[result[0]])
print(output)
#[array([0.], dtype=float32)]
"""
if _non_static_mode():
if correct is None:
......@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
total = _varbase_creator(dtype="int32")
_k = k.numpy().item(0) if isinstance(k, Variable) else k
topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k,
'sorted', False)
_acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label,
correct, total)
topk_out, topk_indices = _legacy_C_ops.top_k_v2(
input, 'k', _k, 'sorted', False
)
_acc, _, _ = _legacy_C_ops.accuracy(
topk_out, topk_indices, label, correct, total
)
return _acc
helper = LayerHelper("accuracy", **locals())
check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
'accuracy')
check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'accuracy'
)
topk_out = helper.create_variable_for_type_inference(dtype=input.dtype)
topk_indices = helper.create_variable_for_type_inference(dtype="int64")
inputs = {"X": [input]}
......@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
else:
attrs = {'k': k}
attrs['sorted'] = False
helper.append_op(type="top_k_v2",
helper.append_op(
type="top_k_v2",
inputs=inputs,
attrs=attrs,
outputs={
"Out": [topk_out],
"Indices": [topk_indices]
})
outputs={"Out": [topk_out], "Indices": [topk_indices]},
)
acc_out = helper.create_variable_for_type_inference(dtype="float32")
if correct is None:
correct = helper.create_variable_for_type_inference(dtype="int32")
if total is None:
total = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
helper.append_op(
type="accuracy",
inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
},
)
return acc_out
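In dygraph the same computation is exposed as the public `paddle.metric.accuracy` helper (an assumption of the current 2.x API surface); a minimal sketch with illustrative tensors:

.. code-block:: python

    import paddle

    # Two samples over three classes: the first top-1 prediction matches the
    # label, the second does not, so top-1 accuracy is 0.5.
    pred = paddle.to_tensor([[0.1, 0.7, 0.2],
                             [0.6, 0.3, 0.1]], dtype='float32')
    label = paddle.to_tensor([[1], [2]], dtype='int64')

    acc = paddle.metric.accuracy(input=pred, label=label, k=1)
    print(acc.numpy())   # [0.5]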
def auc(input,
def auc(
input,
label,
curve='ROC',
num_thresholds=2**12 - 1,
topk=1,
slide_steps=1,
ins_tag_weight=None):
ins_tag_weight=None,
):
"""
**Area Under the Curve (AUC) Layer**
......@@ -216,13 +232,14 @@ def auc(input,
helper = LayerHelper("auc", **locals())
if ins_tag_weight is None:
ins_tag_weight = tensor.fill_constant(shape=[1, 1],
dtype="float32",
value=1.0)
ins_tag_weight = tensor.fill_constant(
shape=[1, 1], dtype="float32", value=1.0
)
check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc')
check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc')
check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight',
['float32', 'float64'], 'auc')
check_variable_and_dtype(
ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc'
)
auc_out = helper.create_variable_for_type_inference(dtype="float64")
batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches.
......@@ -236,62 +253,71 @@ def auc(input,
batch_stat_pos = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1])
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
batch_stat_neg = helper.create_global_variable(
persistable=True,
dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1])
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
# for global auc
# Needn't maintain the batch id
stat_pos = helper.create_global_variable(persistable=True,
dtype='int64',
shape=[1, num_thresholds + 1])
stat_neg = helper.create_global_variable(persistable=True,
dtype='int64',
shape=[1, num_thresholds + 1])
stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
)
stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
)
for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer(var, Constant(value=0.0,
force_cpu=False))
helper.set_variable_initializer(
var, Constant(value=0.0, force_cpu=False)
)
#"InsTagWeight": [ins_tag_weight]
# "InsTagWeight": [ins_tag_weight]
# Batch AUC
helper.append_op(type="auc",
helper.append_op(
type="auc",
inputs={
"Predict": [input],
"Label": [label],
"StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg]
"StatNeg": [batch_stat_neg],
},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": slide_steps
"slide_steps": slide_steps,
},
outputs={
"AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg]
})
"StatNegOut": [batch_stat_neg],
},
)
# Global AUC
helper.append_op(type="auc",
helper.append_op(
type="auc",
inputs={
"Predict": [input],
"Label": [label],
"StatPos": [stat_pos],
"StatNeg": [stat_neg]
"StatNeg": [stat_neg],
},
attrs={
"curve": curve,
"num_thresholds": num_thresholds,
"slide_steps": 0
"slide_steps": 0,
},
outputs={
"AUC": [auc_out],
"StatPosOut": [stat_pos],
"StatNegOut": [stat_neg]
})
return auc_out, batch_auc_out, [
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
]
"StatNegOut": [stat_neg],
},
)
return (
auc_out,
batch_auc_out,
[batch_stat_pos, batch_stat_neg, stat_pos, stat_neg],
)
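The statistic buffer sizes created above follow directly from the defaults; a small worked example:

.. code-block:: python

    num_thresholds = 2 ** 12 - 1    # 4095, the default used above
    slide_steps = 1

    # Batch buffers: one window per slide step plus one extra slot used for
    # the sliding-window bookkeeping.
    batch_buffer_len = (1 + slide_steps) * (num_thresholds + 1) + 1
    print(batch_buffer_len)         # 8193

    # Global buffers only need a single window of counters.
    global_buffer_shape = [1, num_thresholds + 1]
    print(global_buffer_shape)      # [1, 4096]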
The source diff for this file is too large to display. You can view the blob instead.
......@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): A 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): A 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`.
message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or
out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1.
max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
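A minimal usage sketch of the documented `paddle.geometric.send_ue_recv` API (the graph below is illustrative; the edge feature is given shape `[num_edges, 1]` on the assumption that it broadcasts over the node feature dimension):

.. code-block:: python

    import paddle

    # Three nodes with 3-dim features and four directed edges.
    x = paddle.to_tensor([[0., 2., 3.],
                          [1., 4., 5.],
                          [2., 6., 7.]])
    e = paddle.to_tensor([[1.], [1.], [1.], [1.]])   # one scalar feature per edge
    src_index = paddle.to_tensor([0, 1, 2, 0], dtype='int32')
    dst_index = paddle.to_tensor([1, 2, 1, 0], dtype='int32')

    # message = x[src] + e, then sum-reduce the messages per destination node.
    out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index,
                                        message_op="add", reduce_op="sum")
    print(out.numpy())
    # [[ 1.  3.  4.]
    #  [ 4. 10. 12.]
    #  [ 2.  5.  6.]]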
......
......@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
"""
Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32,
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer.
if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python
import paddle
x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2]
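Completing the example data shown above, a sketch of calling `paddle.geometric.reindex_graph` and the outputs it is expected to produce (input nodes keep their positions, new neighbors are appended in order of first appearance):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([0, 1, 2], dtype="int64")
    neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype="int64")
    count = paddle.to_tensor([2, 3, 2], dtype="int32")

    reindex_src, reindex_dst, out_nodes = paddle.geometric.reindex_graph(
        x, neighbors, count
    )
    # reindex_src: [3, 4, 0, 5, 6, 7, 6]
    # reindex_dst: [0, 0, 1, 1, 1, 2, 2]
    # out_nodes:   [0, 1, 2, 8, 9, 4, 7, 6]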
......@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
"""
Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32,
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer.
if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python
import paddle
x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2]
......
......@@ -32,6 +32,7 @@ def sample_neighbors(
name=None,
):
"""
Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to
......@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1,
sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True,
eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the
same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fiser-yates sampling
to speed up. Only useful for gpu version.
to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -78,6 +79,7 @@ def sample_neighbors(
.. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
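Continuing the edge list above, a sketch of sampling two neighbors per input node with `paddle.geometric.sample_neighbors`; `colptr` is the CSC column pointer derived from the same edges, and the sampled result itself is random:

.. code-block:: python

    import paddle

    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")

    # Sample at most 2 neighbors for each node in `nodes`.
    out_neighbors, out_count = paddle.geometric.sample_neighbors(
        row, colptr, nodes, sample_size=2
    )
    # out_count[i] tells how many entries of out_neighbors belong to nodes[i].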
......
......@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase,
fluid.core.eager.Tensor)), "not a variable"
assert isinstance(
var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy()
t = global_scope().find_var(var.name).get_tensor()
......@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(x,
nranks,
ring_id=ring_id,
use_calc_stream=use_calc_stream)
return collective._c_allgather(
x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
)
def wait_server_ready(endpoints):
......@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints:
ip_port = ep.split(":")
with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
......@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break
def init_communicator(program, rank, nranks, wait_port, current_endpoint,
endpoints):
def init_communicator(
program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2:
return
other_endpoints = endpoints[:]
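The endpoint probing performed by `wait_server_ready` above boils down to this standalone sketch (the endpoint value is illustrative):

.. code-block:: python

    import contextlib
    import socket
    import time

    def wait_server_ready(endpoints):
        # Keep probing until every "ip:port" endpoint accepts a TCP connection.
        while True:
            not_ready = []
            for ep in endpoints:
                ip, port = ep.split(":")
                with contextlib.closing(
                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                ) as sock:
                    sock.settimeout(2)
                    if sock.connect_ex((ip, int(port))) != 0:
                        not_ready.append(ep)
            if not not_ready:
                break
            time.sleep(3)

    # wait_server_ready(["127.0.0.1:6170"])  # blocks until the port accepts connections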
......@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'),
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(type='c_gen_nccl_id',
block.append_op(
type='c_gen_nccl_id',
inputs={},
outputs={'Out': nccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
'other_endpoints': other_endpoints,
},
)
block.append_op(type='c_comm_init',
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
})
},
)
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
block.append_op(type='c_gen_hccl_id',
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(type='c_comm_init_hccl',
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks
})
'rank_ids': nranks,
},
)
def prepare_distributed_context(place=None):
if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy()
......@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context():
communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank,
strategy.nranks, True, strategy.current_endpoint,
strategy.trainer_endpoints)
init_communicator(
communicator_prog,
strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place)
exe.run(communicator_prog)
......@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place)
else:
assert ("Only support CUDAPlace for now.")
assert "Only support CUDAPlace for now."
_parallel_context_initialized = True
return strategy
......@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object):
"""
Model training/inference with a static graph.
"""
def __init__(self, model):
......@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0
'test_batch': 0,
}
self._nranks = ParallelEnv().nranks
......@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value
def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
assert (
self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train'
assert update is True, "Does not support `update == False` in static mode by now."
assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels)
def eval_batch(self, inputs, labels=None):
......@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs)
def save(self, path):
def _save(state, path):
if not state:
return
......@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt"
optim = {
p.name: p
for p in filter(is_belong_to_optimizer, prog.list_vars())
p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
}
if not optim:
return
......@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states
fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs], global_scope(),
executor)
[param for param, state in param_state_pairs],
global_scope(),
executor,
)
for param, state in param_state_pairs:
self._set_var(param, state)
......@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is
# different.
state_val = (
np.array(converted_state.pop("global_step")) - 1
) if "global_step" in converted_state else converted_state.pop(
"@LR_DECAY_COUNTER@", None)
(np.array(converted_state.pop("global_step")) - 1)
if "global_step" in converted_state
else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None:
converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"):
......@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None
for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[
len(opt_name) + 1:]
for param_name, state_var in self.model._optimizer._accumulators[
name].items():
accum_name = (
name
if opt_name is None
else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None:
# can not infer out the exact unique(opt_name),
# thus try to extract rather than generate
for state_key in sorted(state.keys(),
for state_key in sorted(
state.keys(),
key=lambda x: len(x),
reverse=True):
prefix = param_name + "_" + (
reverse=True,
):
prefix = (
param_name
+ "_"
+ (
opt_cls_name
if opt_name is None else opt_name) + "_"
if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix):
prefix_offset = state_key[len(
prefix):].find("_") + len(prefix)
prefix_offset = state_key[
len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[
len(param_name + "_"):prefix_offset]
len(
param_name + "_"
) : prefix_offset
]
# TODO: assert
# assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name +
"_" + accum_name + "_0")
dy_state_name = (
param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[
state_var.name] = converted_state.pop(
dy_state_name)
state_var.name
] = converted_state.pop(dy_state_name)
assert var.name in converted_state, \
"variable [{}] is not in optimizer state file".format(var.name)
assert (
var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray):
......@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \
"Model is not ready, please call `model.prepare()` first"
assert (
compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs)
if labels is not None:
labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \
"number of inputs" \
assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs"
+ " does not match number of arguments of `forward` method"
)
feed = {}
input_names = [v.name for v in self._input_vars[self.mode]]
......@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments
if inputs[idx] is not None:
feed[n] = inputs[idx]
if self._amp_level == 'O2' and input_dtypes[
idx] == core.VarDesc.VarType.FP16:
if (
self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array):
......@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else:
pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog,
rets = self._executor.run(
compiled_prog,
feed=feed,
fetch_list=pruned_fetch_list,
return_numpy=False)
return_numpy=False,
)
# restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map):
......@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = []
for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \
and isinstance(self.model._test_dataloader, DataLoader) \
and self._nranks > 1:
if (
self.mode != 'train'
and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size
samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
state = [
s[:int(total_size - current_count), ...] for s in state
s[: int(total_size - current_count), ...] for s in state
]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size -
current_count)
self._merge_count[self.mode + '_batch'] = int(
total_size - current_count
)
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
......@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train':
for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \
and self.model._optimizer._learning_rate_map:
if (
mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name]
......@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
dist_strategy.amp = True
dist_strategy.amp_configs = self._amp_configs.copy()
dist_strategy.amp_configs.update(self._amp_custom_lists)
dist_strategy.amp_configs[
'use_pure_fp16'] = self._amp_level == 'O2'
dist_strategy.amp_configs['use_pure_fp16'] = (
self._amp_level == 'O2'
)
self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy)
self.model._optimizer, strategy=dist_strategy
)
elif self._amp_level != "O0" and core.is_compiled_with_cuda:
amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
amp_lists = (
paddle.static.amp.AutoMixedPrecisionLists(
**self._amp_custom_lists
) if self._amp_custom_lists else None
)
if self._amp_custom_lists
else None
)
self.model._optimizer = paddle.static.amp.decorate(
self.model._optimizer,
amp_lists=amp_lists,
use_pure_fp16=self._amp_level == "O2",
use_fp16_guard=self._use_fp16_guard,
**self._amp_configs)
**self._amp_configs
)
self.model._optimizer.minimize(self._loss_endpoint)
......@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
self._endpoints[mode] = {
"output": outputs,
"loss": to_list(losses),
"metric": metrics
"metric": metrics,
}
def _compile_and_initialize(self, prog, mode):
......@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
if compiled_prog is not None:
return compiled_prog
assert self.model._place is not None, \
"device is not set, please call `model.prepare()` first"
assert (
self.model._place is not None
), "device is not set, please call `model.prepare()` first"
place = self.model._place
......@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
uninitialized = []
for var_py in self._startup_prog.list_vars():
var = fluid.global_scope().find_var(var_py.name)
if not var_py.name.startswith('nccl_id') and var and \
var.get_tensor()._is_initialized():
if (
not var_py.name.startswith('nccl_id')
and var
and var.get_tensor()._is_initialized()
):
continue
uninitialized.append(var_py)
......@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog)
if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda(
if (
self._amp_level == "O2"
and mode == 'train'
and core.is_compiled_with_cuda()
):
self.model._optimizer.amp_init(place)
......@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object):
def __init__(self, model):
super(DynamicGraphAdapter, self).__init__()
self.model = model
......@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0
'test_batch': 0,
}
self._input_info = None
......@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
stradegy.current_endpoint = ParallelEnv().current_endpoint
self.ddp_model = fluid.dygraph.parallel.DataParallel(
self.model.network, stradegy)
self.model.network, stradegy
)
@property
def mode(self):
......@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
# TODO multi device in dygraph mode not implemented at present time
def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
assert (
self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.model.network.train()
self.mode = 'train'
inputs = to_list(inputs)
......@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
if self._amp_level != "O0" and self.model._scaler is None:
self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
with paddle.amp.auto_cast(enable=self._amp_level != 'O0',
with paddle.amp.auto_cast(
enable=self._amp_level != 'O0',
**self._amp_custom_lists,
level=self._amp_level):
level=self._amp_level
):
if self._nranks > 1:
outputs = self.ddp_model(*[to_variable(x) for x in inputs])
else:
......@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
return (
([to_numpy(l) for l in losses], metrics)
if len(metrics) > 0
else [to_numpy(l) for l in losses]
)
def eval_batch(self, inputs, labels=None):
self.model.network.eval()
......@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
metrics = []
for metric in self.model._metrics:
# cut off padding value.
if self.model._test_dataloader is not None and self._nranks > 1 \
and isinstance(self.model._test_dataloader, DataLoader):
if (
self.model._test_dataloader is not None
and self._nranks > 1
and isinstance(self.model._test_dataloader, DataLoader)
):
total_size = len(self.model._test_dataloader.dataset)
samples = outputs[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
outputs = [
o[:int(total_size - current_count)] for o in outputs
o[: int(total_size - current_count)] for o in outputs
]
labels = [
l[:int(total_size - current_count)] for l in labels
l[: int(total_size - current_count)] for l in labels
]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size -
current_count)
self._merge_count[self.mode + '_batch'] = int(
total_size - current_count
)
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
......@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
opt_unq_name = ''
opt_cls_name = self.model._optimizer.__class__.__name__
opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx
opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx
param_names = [param.name for param in self.model.network.parameters()]
for var_name, state_var in sorted(optim_state.items(),
key=lambda x: len(x[0]),
reverse=True):
for var_name, state_var in sorted(
optim_state.items(), key=lambda x: len(x[0]), reverse=True
):
if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
# NOTE: dygraph saved global_step is 1 larger than that in
# static-graph, since the time of global_step to increase is
# different.
if var_name == "@LR_DECAY_COUNTER@":
converted_state["global_step"] = np.array(
converted_state.pop("@LR_DECAY_COUNTER@")) + 1
converted_state["global_step"] = (
np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
)
else:
# moment and other accumulators
# extend state dict to include promising dygraph names
for param_name in param_names:
if var_name.startswith(param_name + "_" + opt_name):
# when init optimizer with name
accum_name = var_name[len(param_name + "_" + opt_name +
"_"):]
elif var_name.startswith(param_name +
"_") and opt_name == opt_cls_name:
accum_name = var_name[
len(param_name + "_" + opt_name + "_") :
]
elif (
var_name.startswith(param_name + "_")
and opt_name == opt_cls_name
):
# when init optimizer without name
accum_name = var_name[len(param_name + "_"):]
accum_name = var_name[len(param_name + "_") :]
else:
continue
# remove suffix idx
accum_name = accum_name[:accum_name.rfind("_")]
accum_name = accum_name[: accum_name.rfind("_")]
# state names always end with "_0" in dygraph because of the
# unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + "_" +
accum_name + "_0")
dy_state_name = (
param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[dy_state_name] = state_var
if not hasattr(self.model._optimizer, 'set_state_dict'):
......@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
self.model._optimizer.set_state_dict(converted_state)
def prepare(self):
if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda(
if (
self._amp_level == "O2"
and self.model.mode == 'train'
and core.is_compiled_with_cuda()
):
self.model.network, self.model._optimizer = paddle.amp.decorate(
models=self.model.network,
optimizers=self.model._optimizer,
level='O2')
level='O2',
)
if self._amp_level != "O0":
self.model._scaler = None
class Model(object):
"""
A Model object is a network with training and inference features.
Dynamic graph and static graph are supported at the same time,
switched by `paddle.enable_static()`. The usage is as follows.
......@@ -1053,6 +1156,7 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True):
"""
Run one training step on a batch of data. The `update` argument indicates
whether the optimizer updates gradients computed from this batch.
......@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label])
print(loss)
# [array([2.192784], dtype=float32)]
"""
loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1107,6 +1212,7 @@ class Model(object):
@no_grad()
def eval_batch(self, inputs, labels=None):
"""
Run one evaluating step on a batch of data.
Args:
......@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label])
print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0]
"""
loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1159,6 +1266,7 @@ class Model(object):
@no_grad()
def predict_batch(self, inputs):
"""
Run one predicting step on a batch of data.
Args:
......@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)]
"""
loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1205,6 +1314,7 @@ class Model(object):
def save(self, path, training=True):
"""
This function saves parameters and optimizer information, or the model and
parameters for inference only, to path. It depends on the parameter
`training`.
......@@ -1272,6 +1382,7 @@ class Model(object):
model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference
"""
if ParallelEnv().local_rank == 0:
......@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False):
"""
Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer.
......@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test')
model.load('checkpoint/test')
"""
def _load_state_from_path(path):
......@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None)
if state is None:
raise ValueError(
"{} is not found in the providing file.".format(key))
"{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state.shape), list(param.shape)))
"{} receives a shape {}, but the expected shape is {}.".format(
key, list(state.shape), list(param.shape)
)
)
return param, state
def _strip_postfix(path):
path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \
"Unknown postfix {} from weights".format(ext)
assert ext in [
'',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path
path = _strip_postfix(path)
......@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err:
if skip_mismatch:
warnings.warn(
("Skip loading for {}. ".format(key) + str(err)))
("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens
reset_optimizer = True
else:
raise err
matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path(
path + ".pdopt")
optim_state = (
None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph
if _non_static_mode():
......@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state,
scaler_state)
return self._adapter.load(
matched_param_state, optim_state, scaler_state
)
else:
return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs):
"""
Returns a list of parameters of the model.
Returns:
......@@ -1411,17 +1535,19 @@ class Model(object):
nn.Linear(200, 10)), input)
params = model.parameters()
"""
return self._adapter.parameters()
def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs():
# pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
assert isinstance(
self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {}
......@@ -1433,7 +1559,8 @@ class Model(object):
elif isinstance(amp_configs, str):
if amp_configs not in ('O0', 'O1', 'O2'):
raise ValueError(
"The level of amp_configs should be 'O0', 'O1' or 'O2'.")
"The level of amp_configs should be 'O0', 'O1' or 'O2'."
)
self._adapter._amp_level = amp_configs
_check_pure_fp16_configs()
return
......@@ -1442,7 +1569,8 @@ class Model(object):
self._adapter._amp_level = 'O1'
elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
raise ValueError(
"amp_configs['level'] should be 'O0', 'O1' or 'O2'.")
"amp_configs['level'] should be 'O0', 'O1' or 'O2'."
)
else:
self._adapter._amp_level = amp_configs['level']
amp_config_key_set = set(amp_configs.keys()) - {'level'}
......@@ -1459,12 +1587,14 @@ class Model(object):
# construct amp_custom_lists
if self._adapter._amp_level != 'O0' and amp_config_key_set:
for param_name in [
'custom_white_list', 'custom_black_list',
'custom_black_varnames'
'custom_white_list',
'custom_black_list',
'custom_black_varnames',
]:
if param_name in amp_config_key_set:
self._adapter._amp_custom_lists[param_name] = amp_configs[
param_name]
param_name
]
amp_config_key_set -= {param_name}
def _check_amp_configs(amp_config_key_set):
......@@ -1479,13 +1609,16 @@ class Model(object):
}
if amp_config_key_set - accepted_param_set:
raise ValueError(
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized."
.format(tuple(amp_config_key_set - accepted_param_set)))
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
tuple(amp_config_key_set - accepted_param_set)
)
)
if 'use_fp16_guard' in amp_config_key_set:
if _non_static_mode():
raise ValueError(
"'use_fp16_guard' is supported in static mode only.")
"'use_fp16_guard' is supported in static mode only."
)
self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
amp_config_key_set.remove('use_fp16_guard')
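Putting the checks above together, `amp_configs` is either a level string ('O0', 'O1', 'O2') or a dict that combines the level with custom op lists and GradScaler options. A minimal sketch, with option values that are illustrative assumptions only:

.. code-block:: python

    import paddle
    from paddle.static import InputSpec

    net = paddle.nn.Linear(10, 2)
    model = paddle.Model(net, inputs=[InputSpec([None, 10], 'float32', 'x')])
    optim = paddle.optimizer.Adam(parameters=model.parameters())

    # amp_configs='O1' would also be accepted; the dict form adds extra options.
    model.prepare(
        optim,
        paddle.nn.CrossEntropyLoss(),
        amp_configs={
            'level': 'O1',
            'custom_white_list': {'matmul_v2'},   # ops forced to run in float16
            'init_loss_scaling': 128.0,           # forwarded to paddle.amp.GradScaler
        },
    )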
......@@ -1495,12 +1628,11 @@ class Model(object):
for key in amp_configs_set:
self._adapter._amp_configs[key] = amp_configs[key]
def prepare(self,
optimizer=None,
loss=None,
metrics=None,
amp_configs=None):
def prepare(
self, optimizer=None, loss=None, metrics=None, amp_configs=None
):
"""
Configures the model before running.
Args:
......@@ -1532,6 +1664,7 @@ class Model(object):
Returns:
None
"""
self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace):
......@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program(
).random_seed
startup_prog_seed = (
fluid.default_startup_program().random_seed
)
fluid.disable_dygraph()
paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program(
).random_seed = startup_prog_seed
fluid.default_startup_program().random_seed = (
startup_prog_seed
)
else:
prepare_distributed_context(self._place)
_parallel_context_initialized = True
......@@ -1562,15 +1697,16 @@ class Model(object):
metrics = metrics or []
for metric in to_list(metrics):
assert isinstance(metric, Metric), \
"{} is not sub class of Metric".format(
metric.__class__.__name__)
assert isinstance(
metric, Metric
), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics)
self._prepare_amp(amp_configs)
self._adapter.prepare()
def fit(self,
def fit(
self,
train_data=None,
eval_data=None,
batch_size=1,
......@@ -1585,8 +1721,10 @@ class Model(object):
num_workers=0,
callbacks=None,
accumulate_grad_batches=1,
num_iters=None):
num_iters=None,
):
"""
Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch.
......@@ -1641,7 +1779,7 @@ class Model(object):
Batching is handled internally.
.. code-block:: python
:name: code-example1
:name: code-example3
import paddle
import paddle.vision.transforms as T
......@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader.
.. code-block:: python
:name: code-example2
:name: code-example4
import paddle
import paddle.vision.transforms as T
......@@ -1718,31 +1856,38 @@ class Model(object):
val_loader,
epochs=2,
save_dir='mnist_checkpoint')
"""
assert train_data is not None, \
"train_data must be given!"
assert train_data is not None, "train_data must be given!"
if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data,
train_sampler = DistributedBatchSampler(
train_data,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
train_loader = DataLoader(train_data,
drop_last=drop_last,
)
train_loader = DataLoader(
train_data,
batch_sampler=train_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
return_list=True,
)
else:
train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data,
batch_size=batch_size)
eval_loader = DataLoader(eval_data,
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size
)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
return_list=True,
)
elif eval_data is not None:
eval_loader = eval_data
else:
......@@ -1755,8 +1900,11 @@ class Model(object):
steps = self._len_data_loader(train_loader)
self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance(
steps, int):
if (
num_iters is not None
and isinstance(num_iters, int)
and isinstance(steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!"
epochs = (num_iters // steps) + 1
steps = min(num_iters, steps)
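The `num_iters` handling above just caps the per-epoch step count and derives how many epochs are needed; a worked example:

.. code-block:: python

    # Suppose the DataLoader yields 100 steps per epoch and training should
    # stop after 250 iterations in total.
    steps = 100
    num_iters = 250

    epochs = (num_iters // steps) + 1   # 3 epochs get scheduled
    steps = min(num_iters, steps)       # at most 100 steps are run per epoch
    print(epochs, steps)                # 3 100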
......@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics': self._metrics_name()
})
cbks.on_begin(
'eval',
{'steps': eval_steps, 'metrics': self._metrics_name()},
)
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
......@@ -1798,14 +1946,16 @@ class Model(object):
cbks.on_end('train', logs)
self._test_dataloader = None
def evaluate(self,
def evaluate(
self,
eval_data,
batch_size=1,
log_freq=10,
verbose=2,
num_workers=0,
callbacks=None,
num_iters=None):
num_iters=None,
):
"""
Evaluate the loss and metrics of the model on input dataset.
......@@ -1859,13 +2009,16 @@ class Model(object):
"""
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data,
batch_size=batch_size)
eval_loader = DataLoader(eval_data,
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size
)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
return_list=True,
)
else:
eval_loader = eval_data
......@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance(
eval_steps, int):
if (
num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics': self._metrics_name()
})
cbks.on_begin(
'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
)
logs = self._run_one_epoch(eval_loader, cbks, 'eval')
......@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result
def predict(self,
def predict(
self,
test_data,
batch_size=1,
num_workers=0,
stack_outputs=False,
verbose=1,
callbacks=None):
callbacks=None,
):
"""
Compute the output predictions on testing data.
......@@ -1980,13 +2137,16 @@ class Model(object):
"""
if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data,
batch_size=batch_size)
test_loader = DataLoader(test_data,
test_sampler = DistributedBatchSampler(
test_data, batch_size=batch_size
)
test_loader = DataLoader(
test_data,
batch_sampler=test_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
return_list=True,
)
else:
test_loader = test_data
......@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred:
warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0])
% self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs)
......@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError(
"The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.")
"file_prefix is empty string."
)
dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname):
......@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None)
assert prog, \
"Model is not ready, please call `model.prepare()` first"
assert (
prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path,
fluid.io.save_inference_model(
model_path,
input_names,
endpoints,
self._adapter._executor,
main_program=infer_prog,
model_filename=model_filename,
params_filename=params_filename)
params_filename=params_filename,
)
def _run_one_epoch(
self,
......@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph
batch_size = data[0].shape()[0] if callable(
data[0].shape) else data[0].shape[0]
batch_size = (
data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]]
_inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0
or step + 1 == len(data_loader))
_inputs.append(
(step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs)
......@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v
else:
if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)])
outs = self.predict_batch(data[: len(self._inputs)])
else:
outs = self.predict_batch(data)
outputs.append(outs)
logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get(
mode + '_batch', 0) <= 0:
if (
mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks
else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
......@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610}
"""
assert (input_size is not None or self._inputs
is not None), "'input_size' or 'self._input' must be set"
assert (
input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None:
_input_size = input_size
else:
......@@ -2208,7 +2381,10 @@ class Model(object):
if is_input:
arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode(
if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
):
out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i])
......@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict):
assert is_input is False
out_specs = [
specs[n] for n in extract_args(self.network.forward)
specs[n]
for n in extract_args(self.network.forward)
if n != 'self'
]
else:
......@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input)
if spec.name is None:
raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}."
.format(i, spec))
"Requires Input[{}].name != None, but receive `None` with {}.".format(
i, spec
)
)
return out_specs
......@@ -2258,6 +2437,7 @@ class Model(object):
"Update self._inputs according to given inputs."
self._input_info = self._adapter._input_info
if self._input_info is not None and len(self._input_info) == 2:
self._inputs = self._verify_spec(None, self._input_info[0],
self._input_info[1], True)
self._inputs = self._verify_spec(
None, self._input_info[0], self._input_info[1], True
)
self._is_shape_inferred = True
......@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
name=None,
):
r"""
The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
.. code-block:: python
y = layer_norm(residual + dropout(bias + x))
Parameters:
......@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output Tensor, the data type and shape is same as `x`.
Tensor, The output Tensor, the data type and shape is same as `x`.
Examples:
.. code-block:: python
# required: gpu
......@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
x, residual, bias)
# [2, 4, 128]
print(output.shape)
"""
seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'):
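As a cross-check of the pseudo code above, the unfused composition with standard ops computes the same quantity (up to dropout randomness and the learned layer-norm scale/bias, which are omitted in this sketch):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    paddle.seed(2022)
    x = paddle.rand([2, 4, 128])
    residual = paddle.rand([2, 4, 128])
    bias = paddle.rand([128])

    # y = layer_norm(residual + dropout(bias + x))
    h = residual + F.dropout(x + bias, p=0.5, training=True)
    y = F.layer_norm(h, normalized_shape=h.shape[-1:])
    print(y.shape)   # [2, 4, 128]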
......
......@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer
from paddle.framework import ParamAttr
import paddle
from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list
from paddle.nn.layer.transformer import (
_convert_attention_mask,
_convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
......@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
if t.place.is_gpu_place():
size_dtype = core.size_of_dtype(dtype)
waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory:
t_used = t._copy_to(paddle.CPUPlace(), False)
......@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
"""
def __init__(self,
def __init__(
self,
embed_dim,
dropout_rate=0.5,
weight_attr=None,
bias_attr=None,
epsilon=1e-5,
name=None):
name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr
self._weight_attr = weight_attr
self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim],
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=self._bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
self.ln_scale = self.create_parameter(
attr=self._weight_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=self._bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.ln_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate
self._epsilon = epsilon
......@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon,
training=self.training,
mode='upscale_in_train',
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon,
self._dtype, name_str)
self.embed_dim,
self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer):
......@@ -246,7 +263,8 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
"""
def __init__(self,
def __init__(
self,
embed_dim,
num_heads,
dropout_rate=0.5,
......@@ -266,13 +284,19 @@ class FusedMultiHeadAttention(Layer):
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
name=None,
):
super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(num_heads))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim)
)
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
......@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel
......@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(shape=[embed_dim],
is_bias=False,
)
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
# tensor model parallel
if nranks > 1:
......@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.pre_ln_bias = self.create_parameter(
attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None
self.ln_bias = None
else:
......@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.ln_bias = self.create_parameter(
attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate
......@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim,
self.normalize_before, self.need_weights, self._dtype, name_str)
self.embed_dim,
self.num_heads,
self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2)
......@@ -495,7 +538,8 @@ class FusedFeedForward(Layer):
# (1, 8, 8)
"""
def __init__(self,
def __init__(
self,
d_model,
dim_feedforward,
dropout_rate=0.1,
......@@ -513,15 +557,20 @@ class FusedFeedForward(Layer):
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
name=None,
):
super(FusedFeedForward, self).__init__()
assert d_model > 0, (
"Expected d_model to be greater than 0, but received {}".format(
d_model))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
assert (
d_model > 0
), "Expected d_model to be greater than 0, but received {}".format(
d_model
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
......@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
......@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward],
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
is_bias=False,
)
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model],
self._linear2_bias = self.create_parameter(
shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
if nranks > 1:
assert ring_id != -1
......@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model],
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(shape=[d_model],
attr=ln1_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
self._ln1_bias = self.create_parameter(
shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None
self._ln2_bias = None
else:
......@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model],
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(shape=[d_model],
attr=ln2_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
self._ln2_bias = self.create_parameter(
shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name
......@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate,
self._epsilon, self._act_method, self._act_dropout_rate,
self._normalize_before, self._dtype, name_str)
self._d_model,
self._dim_feedforward,
self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2)
......@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer):
"""
FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process
and post-process would be applied on the input and output accordingly. If
......@@ -681,7 +749,6 @@ class FusedTransformerEncoderLayer(Layer):
Examples:
.. code-block:: python
# required: gpu
......@@ -694,9 +761,11 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
"""
def __init__(self,
def __init__(
self,
d_model,
nhead,
dim_feedforward,
......@@ -706,21 +775,33 @@ class FusedTransformerEncoderLayer(Layer):
act_dropout_rate=None,
normalize_before=False,
weight_attr=None,
bias_attr=None):
bias_attr=None,
):
self._config = locals()
self._config.pop("self")
self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, "
"but received {}".format(d_model))
assert nhead > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(nhead))
assert (
d_model > 0
), "Expected d_model to be greater than 0, " "but received {}".format(
d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward))
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
"but received {}".format(dim_feedforward)
)
attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
......@@ -739,9 +820,11 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0])
ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(d_model,
self.ffn = FusedFeedForward(
d_model,
dim_feedforward,
dropout_rate=dropout_rate,
activation=activation,
......@@ -750,11 +833,14 @@ class FusedTransformerEncoderLayer(Layer):
linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1])
linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None):
"""
Applies a Transformer encoder layer on the input.
Parameters:
src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`.
......@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when
no positions need to be prevented from being attended to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is
See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default
None.
Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
Tensor|tuple, It is a tensor that has the same shape and data type \
as `enc_input`, representing the output of Transformer encoder \
layer. Or a tuple if `cache` is not None, except for encoder \
layer output, the tuple includes the new cache which is same \
as input `cache` argument but `incremental_cache` has an \
incremental length. See `MultiHeadAttention.gen_cache` and \
`MultiHeadAttention.forward` for more details.
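A minimal call sketch (input shapes assumed, matching the class-level example above):
.. code-block:: python
# enc_input: [batch_size, sequence_length, d_model]
# attn_mask: broadcastable to [batch_size, nhead, sequence_length, sequence_length]
enc_output = encoder_layer(enc_input, src_mask=attn_mask)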
"""
src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask)
else:
attn_out, incremental_cache = self.fused_attn(src,
attn_mask=src_mask,
cache=cache)
attn_out, incremental_cache = self.fused_attn(
src, attn_mask=src_mask, cache=cache
)
ffn_out = self.ffn(attn_out)
......@@ -889,7 +977,8 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128]
"""
def __init__(self,
def __init__(
self,
d_model=512,
nhead=8,
num_encoder_layers=6,
......@@ -903,7 +992,8 @@ class FusedTransformer(Layer):
weight_attr=None,
bias_attr=None,
custom_encoder=None,
custom_decoder=None):
custom_decoder=None,
):
super(FusedTransformer, self).__init__()
raise NotImplementedError()
......@@ -1071,7 +1161,8 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
"""
def __init__(self,
def __init__(
self,
embed_dim,
num_heads,
dim_feedforward,
......@@ -1095,16 +1186,24 @@ class FusedMultiTransformer(Layer):
nranks=1,
trans_qkvw=True,
ring_id=-1,
name=None):
name=None,
):
super(FusedMultiTransformer, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(num_heads))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim)
)
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
......@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel
if nranks > 1:
......@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
ln_bias = self.create_parameter(
attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
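# qkv weight layout depends on trans_qkvw: [3, num_heads, head_dim, embed_dim] when True,
# otherwise [embed_dim, 3, num_heads, head_dim]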
qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim],
if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
linear_bias = self.create_parameter(shape=[embed_dim],
is_bias=False,
)
linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
ffn_ln_scale = self.create_parameter(
shape=[embed_dim],
attr=ffn_ln_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
ffn_ln_bias = self.create_parameter(shape=[embed_dim],
attr=ffn_ln_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
ffn_ln_bias = self.create_parameter(
shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr,
dtype=self._dtype,
is_bias=False)
ffn1_bias = self.create_parameter(shape=[dim_feedforward],
is_bias=False,
)
ffn1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=ffn1_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr,
dtype=self._dtype,
is_bias=False)
ffn2_bias = self.create_parameter(shape=[embed_dim],
is_bias=False,
)
ffn2_bias = self.create_parameter(
shape=[embed_dim],
attr=ffn2_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
# tensor model parallel
if nranks > 1:
......@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train',
trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
......@@ -20,14 +20,17 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row,
def graph_khop_sampler(
row,
colptr,
input_nodes,
sample_sizes,
sorted_eids=None,
return_eids=False,
name=None):
name=None,
):
"""
Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to
......@@ -50,24 +53,23 @@ def graph_khop_sampler(row,
sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape
should only have one dimension.
sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids`
sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
is True. The shape should be [num_edges, 1], and the data
type should be the same with `row`.
return_eids (bool): Whether to return the id of the sample edges. Default is False.
type should be the same with `row`. Default is None.
return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
edge_src (Tensor): The src index of the output edges, also means the first column of
- edge_src (Tensor), The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently.
edge_dst (Tensor): The dst index of the output edges, also means the second column
- edge_dst (Tensor), The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently.
sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes.
reindex_nodes (Tensor): The reindex id of the input nodes.
edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True.
- sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
- reindex_nodes (Tensor), The reindex id of the input nodes.
- edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True.
Examples:
.. code-block:: python
import paddle
......@@ -80,44 +82,72 @@ def graph_khop_sampler(row,
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
edge_src, edge_dst, sample_index, reindex_nodes = \
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
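A variant that also returns edge ids is sketched below; `sorted_eids` is assumed to be
edge ids sorted to match the CSC layout of `row`/`colptr`, with the same dtype as `row`:
.. code-block:: python
sorted_eids = paddle.arange(row.shape[0], dtype="int64")  # assumed ordering
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes,
sorted_eids=sorted_eids, return_eids=True)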
"""
if _non_static_mode():
if return_eids:
if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None "
f"if return_eids is True.")
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \
_legacy_C_ops.graph_khop_sampler(row, sorted_eids,
colptr, input_nodes,
"sample_sizes", sample_sizes,
"return_eids", True)
raise ValueError(
f"`sorted_eid` should not be None "
f"if return_eids is True."
)
(
edge_src,
edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \
_legacy_C_ops.graph_khop_sampler(row, None,
colptr, input_nodes,
"sample_sizes", sample_sizes,
"return_eids", False)
(
edge_src,
edge_dst,
sample_index,
reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"),
"graph_khop_sampler")
check_variable_and_dtype(
row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids:
if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None "
f"if return_eids is True.")
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"),
"graph_khop_sampler")
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"),
"graph_khop_sampler")
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"),
"graph_khop_sampler")
raise ValueError(
f"`sorted_eid` should not be None " f"if return_eids is True."
)
check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
......@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler",
helper.append_op(
type="graph_khop_sampler",
inputs={
"Row": row,
"Eids": sorted_eids,
"Col_Ptr": colptr,
"X": input_nodes
"X": input_nodes,
},
outputs={
"Out_Src": edge_src,
"Out_Dst": edge_dst,
"Sample_Index": sample_index,
"Reindex_X": reindex_nodes,
"Out_Eids": edge_eids
"Out_Eids": edge_eids,
},
attrs={
"sample_sizes": sample_sizes,
"return_eids": return_eids
})
attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
)
if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else:
......
......@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0",
@deprecated(
since="2.4.0",
update_to="paddle.geometric.reindex_graph",
level=1,
reason="paddle.incubate.graph_reindex will be removed in future")
def graph_reindex(x,
reason="paddle.incubate.graph_reindex will be removed in future",
)
def graph_reindex(
x,
neighbors,
count,
value_buffer=None,
index_buffer=None,
flag_buffer_hashtable=False,
name=None):
name=None,
):
"""
Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -40,7 +45,7 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex.
**Notes**:
Notes:
The values in x should be unique, otherwise it may cause potential errors.
Besides, multi-edge-type neighbor reindexing is also supported. If there are neighbors of different
edge types for x, all the neighbors and counts of x should be concatenated, as sketched below.
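A minimal sketch of that case (tensor values assumed, two hypothetical edge types):
.. code-block:: python
# concatenate per-edge-type neighbors and counts before reindexing
neighbors = paddle.concat([neighbors_e1, neighbors_e2])
count = paddle.concat([count_e1, count_e2])
reindex_src, reindex_dst, out_nodes = paddle.incubate.graph_reindex(x, neighbors, count)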
......@@ -58,24 +63,23 @@ def graph_reindex(x,
should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should
be int32, and should be filled with -1.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should
be int32, and should be filled with -1.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up.
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
reindex_src (Tensor): The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex,
- reindex_src (Tensor), The source node index of graph edges after reindex.
- reindex_dst (Tensor), The destination node index of graph edges after reindex.
- out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor
nodes in the back.
Examples:
.. code-block:: python
import paddle
......@@ -109,47 +113,55 @@ def graph_reindex(x,
"""
if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True.")
raise ValueError(
f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer,
"flag_buffer_hashtable", flag_buffer_hashtable)
reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
x,
neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"),
"graph_reindex")
check_variable_and_dtype(
neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"),
"graph_reindex")
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"),
"graph_reindex")
check_variable_and_dtype(
value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
)
check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex",
helper.append_op(
type="graph_reindex",
inputs={
"X":
x,
"Neighbors":
neighbors,
"Count":
count,
"HashTable_Value":
value_buffer if flag_buffer_hashtable else None,
"HashTable_Index":
index_buffer if flag_buffer_hashtable else None,
"X": x,
"Neighbors": neighbors,
"Count": count,
"HashTable_Value": value_buffer if flag_buffer_hashtable else None,
"HashTable_Index": index_buffer if flag_buffer_hashtable else None,
},
outputs={
"Reindex_Src": reindex_src,
"Reindex_Dst": reindex_dst,
"Out_Nodes": out_nodes
"Out_Nodes": out_nodes,
},
attrs={"flag_buffer_hashtable": flag_buffer_hashtable})
attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
)
return reindex_src, reindex_dst, out_nodes
......@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated
since="2.4.0",
update_to="paddle.geometric.sample_neighbors",
level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future")
def graph_sample_neighbors(row,
reason="paddle.incubate.graph_sample_neighbors will be removed in future",
)
def graph_sample_neighbors(
row,
colptr,
input_nodes,
eids=None,
......@@ -34,8 +36,10 @@ def graph_sample_neighbors(row,
sample_size=-1,
return_eids=False,
flag_perm_buffer=False,
name=None):
name=None,
):
"""
Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to
......@@ -71,14 +75,13 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`.
Returns:
out_neighbors (Tensor): The sample neighbors of the input nodes.
out_count (Tensor): The number of sampling neighbors of each input node, and the shape
should be the same with `input_nodes`.
out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
sample edges.
- out_neighbors (Tensor), The sample neighbors of the input nodes.
- out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
- out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges.
Examples:
.. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
......@@ -98,59 +101,83 @@ def graph_sample_neighbors(row,
if return_eids:
if eids is None:
raise ValueError(
f"`eids` should not be None if `return_eids` is True.")
f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer:
if perm_buffer is None:
raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.")
"is True."
)
if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors(
row, colptr, input_nodes, eids, perm_buffer, "sample_size",
sample_size, "return_eids", return_eids, "flag_perm_buffer",
flag_perm_buffer)
(
out_neighbors,
out_count,
out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids:
return out_neighbors, out_count, out_eids
return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"),
"graph_sample_neighbors")
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"),
"graph_sample_neighbors")
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"),
"graph_sample_neighbors")
check_variable_and_dtype(
row, "Row", ("int32", "int64"), "graph_sample_neighbors"
)
check_variable_and_dtype(
colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"),
"graph_sample_neighbors")
check_variable_and_dtype(
eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"),
"graph_sample_neighbors")
check_variable_and_dtype(
perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors",
helper.append_op(
type="graph_sample_neighbors",
inputs={
"Row": row,
"Col_Ptr": colptr,
"X": input_nodes,
"Eids": eids if return_eids else None,
"Perm_Buffer":
perm_buffer if flag_perm_buffer else None
"Perm_Buffer": perm_buffer if flag_perm_buffer else None,
},
outputs={
"Out": out_neighbors,
"Out_Count": out_count,
"Out_Eids": out_eids
"Out_Eids": out_eids,
},
attrs={
"sample_size": sample_size,
"return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer
})
"flag_perm_buffer": flag_perm_buffer,
},
)
if return_eids:
return out_neighbors, out_count, out_eids
return out_neighbors, out_count
......@@ -36,7 +36,8 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(x,
def resnet_basic_block(
x,
filter1,
scale1,
bias1,
......@@ -69,73 +70,198 @@ def resnet_basic_block(x,
use_global_stats=None,
training=False,
trainable_statistics=False,
find_conv_max=True):
find_conv_max=True,
):
if fluid.framework.in_dygraph_mode():
attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3,
'padding1', padding1, 'padding2', padding2, 'padding3',
padding3, 'dilation1', dilation1, 'dilation2', dilation2,
'dilation3', dilation3, 'group', groups, 'momentum', momentum,
'epsilon', eps, 'data_format', data_format, 'has_shortcut',
has_shortcut, 'use_global_stats', use_global_stats,
"trainable_statistics", trainable_statistics, 'is_test',
not training, 'act_type', "relu", 'find_conv_input_max',
find_conv_max)
out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \
getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \
filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs)
attrs = (
'stride1',
stride1,
'stride2',
stride2,
'stride3',
stride3,
'padding1',
padding1,
'padding2',
padding2,
'padding3',
padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
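# the legacy fused op returns `out` plus conv/bn intermediates, running statistics and
# per-conv max values; only `out` is needed here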
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out
helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(dtype=x.dtype,
stop_gradient=True)
conv1 = helper.create_variable_for_type_inference(dtype=x.dtype,
stop_gradient=True)
out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
running_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1
running_var1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1
conv2 = helper.create_variable_for_type_inference(dtype=x.dtype,
stop_gradient=True)
conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype,
stop_gradient=True)
dtype=bn_param_dtype, stop_gradient=True
)
running_mean1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean1 is None
else mean1
)
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
running_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2
running_var2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2
conv3 = helper.create_variable_for_type_inference(dtype=x.dtype,
stop_gradient=True)
dtype=bn_param_dtype, stop_gradient=True
)
running_mean2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
running_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3
running_var3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3
dtype=bn_param_dtype, stop_gradient=True
)
running_mean3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True)
dtype=max_dtype, stop_gradient=True
)
inputs = {
'X': x,
......@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics,
'is_test': not training,
'act_type': "relu",
'find_conv_input_max': find_conv_max
'find_conv_input_max': find_conv_max,
}
outputs = {
......@@ -203,39 +329,120 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max,
}
helper.append_op(type='resnet_basic_block',
inputs=inputs,
outputs=outputs,
attrs=attrs)
helper.append_op(
type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
)
return out
class ResNetBasicBlock(Layer):
"""
r"""
ResNetBasicBlock is designed to optimize the performance of the basic unit of an SSD ResNet block.
The architecture of the fused op is as follows:

    has_shortcut = True:                  else:
              X                                X
            /                                /
          |       |                        |     |
        CONV1     |                      CONV1   |
          |       |                        |     |
         BN1      |                       BN1    |
          |       |                        |     |
        RELU1     |                      RELU1   |
          |       |                        |     |
        CONV2   CONV3                    CONV2   |
          |       |                        |     |
         BN2     BN3                      BN2    |
           \     /                          \   /
             ADD                             ADD
              |                               |
             RELU                            RELU
              |                               |
              Y                               Y

If has_shortcut = True, it computes 3 Conv2D, 3 BatchNorm and 2 ReLU operations in one fused pass.
If has_shortcut = False, it computes 2 Conv2D, 2 BatchNorm and 2 ReLU operations in one fused pass. In this
case the output has the same shape as the input.
Args:
num_channels (int): The number of input image channel.
num_filter (int): The number of filters. It is the same as the number of output image channels.
filter_size (int|list|tuple): The filter size. If filter_size
is a tuple, it must contain two integers, (filter_size_height,
filter_size_width). Otherwise, filter_size_height = filter_size_width =\
filter_size.
stride (int, optional): The stride size. It means the stride in convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None
momentum (float, optional): The value used for the moving_mean and
moving_var computation. This should be a float number or a Tensor with
shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. Now is only support `"NCHW"`, the data is stored in
the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
test phase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
padding (int, optional): The padding size. Only padding_height = padding_width = padding is supported.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
points. Only dilation_height = dilation_width = dilation is supported.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
A Tensor representing the ResNetBasicBlock, whose data type is the same with input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
"""
def __init__(self,
def __init__(
self,
num_channels1,
num_filter1,
filter1_size,
......@@ -277,14 +484,17 @@ class ResNetBasicBlock(Layer):
dilation2=1,
dilation3=1,
trainable_statistics=False,
find_conv_max=True):
find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1
self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(filter1_size, 2,
'filter1_size')
self._kernel2_size = utils.convert_to_list(filter2_size, 2,
'filter2_size')
self._kernel1_size = utils.convert_to_list(
filter1_size, 2, 'filter1_size'
)
self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1
self._dilation2 = dilation2
self._padding1 = padding1
......@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max
if has_shortcut:
self._kernel3_size = utils.convert_to_list(filter3_size, 2,
'filter3_size')
self._kernel3_size = utils.convert_to_list(
filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3
self._stride3 = stride3
self._dilation3 = dilation3
......@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format:
raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format))
valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num)**0.5
std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std)
# init filter
......@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape,
attr=filter1_attr,
default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size))
num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter(
shape=bn1_param_shape,
attr=scale1_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0))
self.bias_1 = self.create_parameter(shape=bn1_param_shape,
default_initializer=I.Constant(1.0),
)
self.bias_1 = self.create_parameter(
shape=bn1_param_shape,
attr=bias1_attr,
dtype=bn_param_dtype,
is_bias=True)
self.mean_1 = self.create_parameter(attr=ParamAttr(
is_bias=True,
)
self.mean_1 = self.create_parameter(
attr=ParamAttr(
name=moving_mean1_name,
initializer=I.Constant(0.0),
trainable=False),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter(
attr=ParamAttr(name=moving_var1_name,
attr=ParamAttr(
name=moving_var1_name,
initializer=I.Constant(1.0),
trainable=False),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter(
shape=filter2_shape,
attr=filter2_attr,
default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size))
num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter(
shape=bn2_param_shape,
attr=scale2_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0))
self.bias_2 = self.create_parameter(shape=bn2_param_shape,
default_initializer=I.Constant(1.0),
)
self.bias_2 = self.create_parameter(
shape=bn2_param_shape,
attr=bias2_attr,
dtype=bn_param_dtype,
is_bias=True)
self.mean_2 = self.create_parameter(attr=ParamAttr(
is_bias=True,
)
self.mean_2 = self.create_parameter(
attr=ParamAttr(
name=moving_mean2_name,
initializer=I.Constant(0.0),
trainable=False),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter(
attr=ParamAttr(name=moving_var2_name,
attr=ParamAttr(
name=moving_var2_name,
initializer=I.Constant(1.0),
trainable=False),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True
if has_shortcut:
bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [
num_filter3, num_channels3, filter3_size, filter3_size
num_filter3,
num_channels3,
filter3_size,
filter3_size,
]
self.filter_3 = self.create_parameter(
shape=filter3_shape,
attr=filter3_attr,
default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size))
num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter(
shape=bn3_param_shape,
attr=scale3_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0))
self.bias_3 = self.create_parameter(shape=bn3_param_shape,
default_initializer=I.Constant(1.0),
)
self.bias_3 = self.create_parameter(
shape=bn3_param_shape,
attr=bias3_attr,
dtype=bn_param_dtype,
is_bias=True)
self.mean_3 = self.create_parameter(attr=ParamAttr(
is_bias=True,
)
self.mean_3 = self.create_parameter(
attr=ParamAttr(
name=moving_mean3_name,
initializer=I.Constant(0.0),
trainable=False),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(attr=ParamAttr(
self.var_3 = self.create_parameter(
attr=ParamAttr(
name=moving_var3_name,
initializer=I.Constant(1.0),
trainable=False),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype)
dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True
else:
self.filter_3 = None
......@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats,
training=self.training,
trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max)
find_conv_max=self._find_conv_max,
)
return out
......@@ -715,6 +715,7 @@ def upsample(
name=None,
):
"""
This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
......@@ -725,11 +726,12 @@ def upsample(
and the resizing only applies on the three dimensions(depth, height and width).
Supporting resample methods:
'linear' : Linear interpolation
'bilinear' : Bilinear interpolation
'trilinear' : Trilinear interpolation
'nearest' : Nearest neighbor interpolation
'bicubic' : Bicubic interpolation
- 'linear' : Linear interpolation
- 'bilinear' : Bilinear interpolation
- 'trilinear' : Trilinear interpolation
- 'nearest' : Nearest neighbor interpolation
- 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities.
......@@ -831,8 +833,9 @@ def upsample(
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
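For instance, a quick shape check of the ``scale_factor`` path (values assumed):
.. code-block:: python
import paddle
x = paddle.rand((2, 3, 6, 10))  # NCHW
y = paddle.nn.functional.upsample(x, scale_factor=2, mode='nearest')
print(y.shape)  # [2, 3, 12, 20]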
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of linear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
......@@ -876,6 +879,7 @@ def upsample(
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
......
......@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
r"""
It computes the pairwise distance between two vectors. The
distance is calculated by the p-order norm:
......@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
Returns:
Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
......
......@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None):
r"""
Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is:
......@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples:
......@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss.numpy())
# [1.4]
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......@@ -2286,6 +2288,7 @@ def cross_entropy(
name=None,
):
r"""
By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing.
......@@ -2399,21 +2402,13 @@ def cross_entropy(
Parameters:
- **input** (Tensor)
Input tensor, the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
output of softmax operator, which will produce incorrect results.
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor)
label (Tensor):
1. If soft_label=False, the shape is
:math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
the data type is int32, int64, float32, float64, where each value is [0, C-1].
......@@ -2421,48 +2416,27 @@ def cross_entropy(
2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1.
- **weight** (Tensor, optional)
a manual rescaling weight given to each class.
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` .
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` .
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
- **soft_label** (bool, optional)
Indicate whether label is soft.
Default is ``False``.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
axis (int, optional): The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
number of dimensions of input :attr:`input`.
Default is ``-1`` .
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
Default is ``True``.
- **name** (str, optional)
The name of the operator. Default is ``None`` .
name (str, optional): The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` .
Returns:
......@@ -2478,9 +2452,7 @@ def cross_entropy(
2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
Examples:
.. code-block:: python
# hard labels
......@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None):
"""
The API measures the soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as:
......@@ -3842,7 +3815,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Parameters:
input (Tensor): The input predications tensor with shape: [N, *],
input (Tensor): The input predictions tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64.
......@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1].
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
Examples:
.. code-block:: python
......@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
# [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
# [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
# [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......
......@@ -1735,16 +1735,18 @@ def adaptive_avg_pool1d(x, output_size, name=None):
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
"""
r"""
Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
hstart &= floor(i * H_{in} / H_{out}) \\
hend &= ceil((i + 1) * H_{in} / H_{out}) \\
wstart &= floor(j * W_{in} / W_{out}) \\
wend &= ceil((j + 1) * W_{in} / W_{out}) \\
Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)}
Args:
......@@ -1753,14 +1755,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two elements, (H, W). H and W can be either an int, or None which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
data_format (str, optional): The data format of the input and output data. An optional string
from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
the order of: [batch_size, input_channels, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
Examples:
.. code-block:: python
......@@ -1788,6 +1791,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
x = x,
output_size=[3, 3])
# out.shape is [2, 3, 3, 3]
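# --- A hedged check of the hstart/hend formula above (separate, illustrative
# --- tensor; not part of the original example) ---
import math
import paddle
import paddle.nn.functional as F
x2 = paddle.rand([1, 1, 5, 5])                    # H_in = W_in = 5
out2 = F.adaptive_avg_pool2d(x2, output_size=3)   # H_out = W_out = 3
i = j = 0
hstart, hend = math.floor(i * 5 / 3), math.ceil((i + 1) * 5 / 3)
wstart, wend = math.floor(j * 5 / 3), math.ceil((j + 1) * 5 / 3)
manual = x2[0, 0, hstart:hend, wstart:wend].mean()
# `manual` and `out2[0, 0, 0, 0]` should agree up to float rounding.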
"""
if not in_dynamic_mode():
check_variable_and_dtype(
......@@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
"""
r"""
This operation applies 3D adaptive avg pooling on input tensor. The d, h and w dimensions
of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
dstart &= floor(i * D_{in} / D_{out}) \\
dend &= ceil((i + 1) * D_{in} / D_{out}) \\
hstart &= floor(j * H_{in} / H_{out}) \\
hend &= ceil((j + 1) * H_{in} / H_{out}) \\
wstart &= floor(k * W_{in} / W_{out}) \\
wend &= ceil((k + 1) * W_{in} / W_{out}) \\
Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]}
{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
The data type can be float32, float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or
list, it must contain three elements, (D, H, W). D, H and W can be either an int,
or None which means the size will be the same as that of the input.
data_format (str, optional): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
Usually name is no need to set and None by default.
Returns:
Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Examples:
.. code-block:: python
......@@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
x = input_data,
output_size=[3, 3, 3])
# out.shape is [2, 3, 3, 3, 3]
"""
if not in_dynamic_mode():
check_variable_and_dtype(
......
......@@ -1450,15 +1450,16 @@ class Maxout(Layer):
class Softmax2D(Layer):
r"""
Softmax2D Activation.
Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
The sum of the result in each location (C, H_i, W_j) will be one.
Shape:
- Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
- Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input)
- Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)
Return:
Returns:
A Tensor of the same shape and dtype as input with value in range [0, 1].
Examples:
......@@ -1483,6 +1484,7 @@ class Softmax2D(Layer):
# [[0.42368975 0.51082766 0.47752273 0.5258871 ]
# [0.66754097 0.47182566 0.5187628 0.5402329 ]
# [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
"""
def __init__(self, name=None):
......
......@@ -20,6 +20,7 @@ __all__ = []
class PairwiseDistance(Layer):
r"""
It computes the pairwise distance between two vectors. The
distance is calculated by the p-order norm:
......@@ -38,10 +39,10 @@ class PairwiseDistance(Layer):
Generally, no setting is required. Default: None.
Shape:
x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
- x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
is the dimension of the data. Available data type is float32, float64.
y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
output: The same dtype as input tensor.
- y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
- output: The same dtype as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
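A minimal, hedged usage sketch of the shapes described above (values are illustrative):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 3.], [3., 5.]])
    y = paddle.to_tensor([[5., 6.], [7., 8.]])
    dist = paddle.nn.PairwiseDistance(p=2)     # Euclidean (2-order) norm
    print(dist(x, y))                          # shape [N] = [2] with keepdim=False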
......
......@@ -31,7 +31,8 @@ __all__ = []
class BCEWithLogitsLoss(Layer):
r"""
This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
Also, we can see it as the combination of ``sigmoid_cross_entropy_with_logits``
layer and some reduce operations.
......@@ -86,21 +87,21 @@ class BCEWithLogitsLoss(Layer):
For more information, please refer to :ref:`api_guide_Name`.
Shapes:
logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
- logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`],
N is batch_size, `*` means number of additional dimensions. The ``logit``
is usually the output of Linear layer. Available dtype is float32, float64.
label (Tensor): The target labels tensor. 2-D tensor with the same shape as
- label (Tensor): The target labels tensor. 2-D tensor with the same shape as
``logit``. The target labels which values should be numbers between 0 and 1.
Available dtype is float32, float64.
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
- output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``logit`` , else the shape of output is scalar.
Returns:
A callable object of BCEWithLogitsLoss.
Examples:
.. code-block:: python
import paddle
logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
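# A hedged continuation of this example (not part of the original docstring):
bce_logit_loss = paddle.nn.BCEWithLogitsLoss()   # sigmoid is applied internally
output = bce_logit_loss(logit, label)
print(output)                                    # mean-reduced scalar loss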
......@@ -139,6 +140,7 @@ class BCEWithLogitsLoss(Layer):
class CrossEntropyLoss(Layer):
r"""
By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing.
......@@ -251,60 +253,35 @@ class CrossEntropyLoss(Layer):
Parameters:
- **weight** (Tensor, optional)
a manual rescaling weight given to each class.
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` .
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` .
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
- **soft_label** (bool, optional)
Indicate whether label is soft.
soft_label (bool, optional): Indicate whether label is soft.
If soft_label=False, the label is hard. If soft_label=True, the label is soft.
Default is ``False``.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
axis (int, optional): The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
of dimensions of input :attr:`input`.
Default is ``-1`` .
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
Default is ``True``.
- **name** (str, optional)
The name of the operator. Default is ``None`` .
name (str, optional): The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` .
Shape:
- **input** (Tensor)
Input tensor, the data type is float32, float64. Shape is
- **input** (Tensor), the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is the number of classes, ``k >= 1`` .
Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
......@@ -312,7 +289,6 @@ class CrossEntropyLoss(Layer):
2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor)
1. If soft_label=False, the shape is
......@@ -322,14 +298,9 @@ class CrossEntropyLoss(Layer):
2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1.
- **output** (Tensor)
Return the softmax cross_entropy loss of ``input`` and ``label``.
- **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
The data type is the same as input.
If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
If :attr:`reduction` is ``'none'``:
1. If soft_label = False, the dimension of return value is the same with ``label`` .
......@@ -634,6 +605,7 @@ class MSELoss(Layer):
class L1Loss(Layer):
r"""
Construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
......@@ -663,10 +635,10 @@ class L1Loss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shape:
input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
- input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
- label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64.
- output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples:
......@@ -692,6 +664,7 @@ class L1Loss(Layer):
print(output)
# [[0.20000005 0.19999999]
# [0.2 0.79999995]]
"""
def __init__(self, reduction='mean', name=None):
......@@ -712,6 +685,7 @@ class L1Loss(Layer):
class BCELoss(Layer):
"""
This interface is used to construct a callable object of the ``BCELoss`` class.
The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
and target labels ``label`` . The binary_cross_entropy loss can be described as:
......@@ -755,13 +729,13 @@ class BCELoss(Layer):
For more information, please refer to :ref:`api_guide_Name`.
Shape:
input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means
- input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
number of additional dimensions. The input ``input`` should always
be the output of sigmoid. Available dtype is float32, float64.
label (Tensor): 2-D tensor with the same shape as ``input``. The target
- label (Tensor): 2-D tensor with the same shape as ``input``. The target
labels which values should be numbers between 0 and 1. Available
dtype is float32, float64.
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
- output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is scalar.
Returns:
......@@ -914,6 +888,7 @@ class NLLLoss(Layer):
class KLDivLoss(Layer):
r"""
Generate a callable object of 'KLDivLoss' to calculate the
Kullback-Leibler divergence loss between Input(X) and
Input(Target). Notes that Input(X) is the log-probability
......@@ -933,14 +908,10 @@ class KLDivLoss(Layer):
Default is ``'mean'``.
Shape:
- input (Tensor): (N, *), where * means, any number of additional dimensions.
- label (Tensor): (N, *), same shape as input.
- input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
- label (Tensor): ``(N, *)``, same shape as input.
- output (Tensor): tensor with shape: [1] by default.
Examples:
.. code-block:: python
......@@ -970,6 +941,7 @@ class KLDivLoss(Layer):
kldiv_criterion = nn.KLDivLoss(reduction='none')
pred_loss = kldiv_criterion(x, target)
# shape=[5, 20]
"""
def __init__(self, reduction='mean'):
......@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer):
class SoftMarginLoss(Layer):
r"""
Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as:
......@@ -1738,16 +1711,13 @@ class SoftMarginLoss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shapes:
Input (Tensor): The input tensor with shape: [N, *],
- Input (Tensor): The input tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
Available dtype is float32, float64.
Label (Tensor): The target labels tensor with the same shape as
- Label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1.
Available dtype is int32, int64, float32, float64.
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
- Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1].
Returns:
......@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer):
# [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511],
# [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399],
# [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]])
"""
def __init__(self, reduction='mean', name=None):
......
......@@ -321,6 +321,7 @@ Where `H` means height of feature map, `W` means width of feature map.
class GroupNorm(Layer):
"""
This interface is used to construct a callable object of the ``GroupNorm`` class.
For more details, refer to code examples.
It implements the function of the Group Normalization Layer.
......@@ -341,7 +342,7 @@ class GroupNorm(Layer):
name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
Shape:
- x: Tensor with shape: (batch, num_features, *).
- x: Tensor with shape: :attr:`(batch, num_features, *)`.
- output: The same shape as input x.
Returns:
......@@ -1047,6 +1048,7 @@ class BatchNorm3D(_BatchNormBase):
class SyncBatchNorm(_BatchNormBase):
r"""
This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
be used as a normalizer function for other operations, such as conv2d and fully connected
......@@ -1092,9 +1094,9 @@ class SyncBatchNorm(_BatchNormBase):
- :math:`\beta` : trainable shift parameter vector
Note:
If you want to use container to pack your model and has ``SyncBatchNorm`` in the
evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of
``list`` to pack the model.
If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the
evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of
:ref:`api_paddle_hub_list` to pack the model.
Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``.
......@@ -1112,8 +1114,8 @@ class SyncBatchNorm(_BatchNormBase):
have trainable bias parameter. Default: None.
Shapes:
input: Tensor that the dimension from 2 to 5.
output: Tensor with the same shape as input.
- input: Tensor that the dimension from 2 to 5.
- output: Tensor with the same shape as input.
Examples:
.. code-block:: python
......@@ -1135,6 +1137,7 @@ class SyncBatchNorm(_BatchNormBase):
# [[ 0.80956620, -0.66528702],
# [-1.27446556, 1.13018656]]]])
"""
def __init__(
......@@ -1284,8 +1287,8 @@ class SyncBatchNorm(_BatchNormBase):
The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
......
......@@ -224,6 +224,7 @@ class AvgPool2D(Layer):
class AvgPool3D(Layer):
"""
This operation applies 3D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels,
......@@ -264,6 +265,7 @@ class AvgPool3D(Layer):
The data type can be float32, float64.
- output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor.
The data type is same as input x.
Examples:
.. code-block:: python
......
......@@ -514,14 +514,17 @@ class QuantizedConv2D(Layer):
class QuantizedConv2DTranspose(Layer):
"""
The computational logic of QuantizedConv2DTranspose is the same as Conv2DTranspose.
The only difference is that its inputs are all fake quantized.
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
conv = nn.Conv2DTranspose(4, 6, (3, 3))
conv_quantized = QuantizedConv2DTranspose(conv)
......@@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer):
y_np = y_var.numpy()
print(y_np.shape, y_quantized_np.shape)
# (2, 6, 10, 10), (2, 6, 10, 10)
"""
def __init__(self,
......
......@@ -1661,6 +1661,7 @@ class MultiplicativeDecay(LRScheduler):
class OneCycleLR(LRScheduler):
r"""
Sets the learning rate according to the one cycle learning rate scheduler.
The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
......@@ -1674,22 +1675,25 @@ class OneCycleLR(LRScheduler):
Also note that you should update learning rate each step.
Args:
max_learning_rate (float): The maximum learning rate. It is a python float number.
Functionally, it defines the initial learning rate by ``divide_factor`` .
max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
total_steps (int): Number of total training steps.
divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3.
anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing,
'linear' for linear annealing. Default: 'cos'.
anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
three_phase (bool, optional): Whether to use three phase.
If ``True``:
1. The learning rate will first increase from initial learning rate to maximum learning rate.
2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.
If ``False``:
1. The learning rate will increase to maximum learning rate.
2. Then it will directly decrease to minimum learning rate.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
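Given the parameters above, a minimal, hedged dygraph sketch (values are illustrative; the training step itself is elided):

.. code-block:: python

    import paddle

    scheduler = paddle.optimizer.lr.OneCycleLR(
        max_learning_rate=1.0, total_steps=100, three_phase=False)
    linear = paddle.nn.Linear(10, 10)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler,
                               parameters=linear.parameters())
    for step in range(100):
        # ... forward / backward / sgd.step() / sgd.clear_grad() ...
        scheduler.step()    # update the learning rate every step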
......@@ -1741,6 +1745,7 @@ class OneCycleLR(LRScheduler):
},
fetch_list=loss.name)
scheduler.step() # You should update learning rate each step
"""
def __init__(
......
......@@ -124,7 +124,8 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
if frame_length > x.shape[axis]:
raise ValueError(
f'Attribute frame_length should be less than or equal to the sequence length, '
f'but got ({frame_length}) > ({x.shape[axis]}).')
f'but got ({frame_length}) > ({x.shape[axis]}).'
)
op_type = 'frame'
......@@ -132,25 +133,33 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
return _C_ops.frame(x, frame_length, hop_length, axis)
if _in_legacy_dygraph():
attrs = ('frame_length', frame_length, 'hop_length', hop_length, 'axis',
axis)
attrs = (
'frame_length',
frame_length,
'hop_length',
hop_length,
'axis',
axis,
)
op = getattr(_legacy_C_ops, op_type)
out = op(x, *attrs)
else:
check_variable_and_dtype(
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
op_type)
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
)
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype=dtype)
helper.append_op(type=op_type,
helper.append_op(
type=op_type,
inputs={'X': x},
attrs={
'frame_length': frame_length,
'hop_length': hop_length,
'axis': axis
'axis': axis,
},
outputs={'Out': out})
outputs={'Out': out},
)
return out
......@@ -225,22 +234,22 @@ def overlap_add(x, hop_length, axis=-1, name=None):
out = op(x, *attrs)
else:
check_variable_and_dtype(
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
op_type)
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
)
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype=dtype)
helper.append_op(type=op_type,
helper.append_op(
type=op_type,
inputs={'X': x},
attrs={
'hop_length': hop_length,
'axis': axis
},
outputs={'Out': out})
attrs={'hop_length': hop_length, 'axis': axis},
outputs={'Out': out},
)
return out
def stft(x,
def stft(
x,
n_fft,
hop_length=None,
win_length=None,
......@@ -249,8 +258,10 @@ def stft(x,
pad_mode='reflect',
normalized=False,
onesided=True,
name=None):
name=None,
):
r"""
Short-time Fourier transform (STFT).
The STFT computes the discrete Fourier transforms (DFT) of short overlapping
......@@ -263,9 +274,12 @@ def stft(x,
Where:
- :math:`t`: The :math:`t`-th input window.
- :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`,
or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`.
- :math:`N`: Value of `n_fft`.
- :math:`H`: Value of `hop_length`.
Args:
......@@ -292,9 +306,9 @@ def stft(x,
to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`(
real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`(
`onesided` is `False`)
The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`
(real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`
(`onesided` is `False`)
Examples:
.. code-block:: python
......@@ -311,14 +325,17 @@ def stft(x,
x = paddle.randn([8, 48000], dtype=paddle.float64) + \
paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128
y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372]
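# A hedged check of the frame count above (not part of the original example):
# with center=False, num_frames = 1 + (seq_len - n_fft) // hop_length, and
# hop_length defaults to n_fft // 4 = 128, so:
num_frames = 1 + (48000 - 512) // 128    # = 372, matching y1.shape[-1]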
"""
check_variable_and_dtype(x, 'x',
['float32', 'float64', 'complex64', 'complex128'],
'stft')
check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft'
)
x_rank = len(x.shape)
assert x_rank in [1, 2], \
f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}'
assert x_rank in [
1,
2,
], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}'
if x_rank == 1: # (batch, seq_length)
x = x.unsqueeze(0)
......@@ -326,69 +343,77 @@ def stft(x,
if hop_length is None:
hop_length = int(n_fft // 4)
assert hop_length > 0, \
f'hop_length should be > 0, but got {hop_length}.'
assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.'
if win_length is None:
win_length = n_fft
if _non_static_mode():
assert 0 < n_fft <= x.shape[-1], \
f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
assert (
0 < n_fft <= x.shape[-1]
), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
assert 0 < win_length <= n_fft, \
f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
assert (
0 < win_length <= n_fft
), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \
f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.'
assert (
len(window.shape) == 1 and len(window) == win_length
), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.'
else:
window = paddle.ones(shape=(win_length, ), dtype=x.dtype)
window = paddle.ones(shape=(win_length,), dtype=x.dtype)
if win_length < n_fft:
pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left
window = paddle.nn.functional.pad(window,
pad=[pad_left, pad_right],
mode='constant')
window = paddle.nn.functional.pad(
window, pad=[pad_left, pad_right], mode='constant'
)
if center:
assert pad_mode in ['constant', 'reflect'], \
'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode)
assert pad_mode in [
'constant',
'reflect',
], 'pad_mode should be "reflect" or "constant", but got "{}".'.format(
pad_mode
)
pad_length = n_fft // 2
# FIXME: Input `x` can be a complex tensor but pad does not support complex input.
x = paddle.nn.functional.pad(x.unsqueeze(-1),
x = paddle.nn.functional.pad(
x.unsqueeze(-1),
pad=[pad_length, pad_length],
mode=pad_mode,
data_format="NLC").squeeze(-1)
data_format="NLC",
).squeeze(-1)
x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1)
x_frames = x_frames.transpose(
perm=[0, 2,
1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
perm=[0, 2, 1]
) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
x_frames = paddle.multiply(x_frames, window)
norm = 'ortho' if normalized else 'backward'
if is_complex(x_frames):
assert not onesided, \
'onesided should be False when input or window is a complex Tensor.'
assert (
not onesided
), 'onesided should be False when input or window is a complex Tensor.'
if not is_complex(x):
out = fft_r2c(x=x_frames,
out = fft_r2c(
x=x_frames,
n=None,
axis=-1,
norm=norm,
forward=True,
onesided=onesided,
name=name)
name=name,
)
else:
out = fft_c2c(x=x_frames,
n=None,
axis=-1,
norm=norm,
forward=True,
name=name)
out = fft_c2c(
x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name
)
out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames)
......@@ -398,7 +423,8 @@ def stft(x,
return out
def istft(x,
def istft(
x,
n_fft,
hop_length=None,
win_length=None,
......@@ -408,7 +434,8 @@ def istft(x,
onesided=True,
length=None,
return_complex=False,
name=None):
name=None,
):
r"""
Inverse short-time Fourier transform (ISTFT).
......@@ -484,8 +511,12 @@ def istft(x,
check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft')
x_rank = len(x.shape)
assert x_rank in [2, 3], \
'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank)
assert x_rank in [
2,
3,
], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(
x_rank
)
if x_rank == 2: # (batch, n_fft, n_frames)
x = x.unsqueeze(0)
......@@ -497,83 +528,107 @@ def istft(x,
win_length = n_fft
# Assure no gaps between frames.
assert 0 < hop_length <= win_length, \
'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length)
assert (
0 < hop_length <= win_length
), 'hop_length should be in (0, win_length({})], but got {}.'.format(
win_length, hop_length
)
assert 0 < win_length <= n_fft, \
'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length)
assert (
0 < win_length <= n_fft
), 'win_length should be in (0, n_fft({})], but got {}.'.format(
n_fft, win_length
)
n_frames = x.shape[-1]
fft_size = x.shape[-2]
if _non_static_mode():
if onesided:
assert (fft_size == n_fft // 2 + 1), \
'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
assert (
fft_size == n_fft // 2 + 1
), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(
n_fft // 2 + 1, fft_size
)
else:
assert (fft_size == n_fft), \
'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size)
assert (
fft_size == n_fft
), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(
n_fft, fft_size
)
if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \
'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape)
assert (
len(window.shape) == 1 and len(window) == win_length
), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(
win_length, window.shape
)
else:
window_dtype = paddle.float32 if x.dtype in [
paddle.float32, paddle.complex64
] else paddle.float64
window = paddle.ones(shape=(win_length, ), dtype=window_dtype)
window_dtype = (
paddle.float32
if x.dtype in [paddle.float32, paddle.complex64]
else paddle.float64
)
window = paddle.ones(shape=(win_length,), dtype=window_dtype)
if win_length < n_fft:
pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left
# FIXME: Input `window` can be a complex tensor but pad does not support complex input.
window = paddle.nn.functional.pad(window,
pad=[pad_left, pad_right],
mode='constant')
window = paddle.nn.functional.pad(
window, pad=[pad_left, pad_right], mode='constant'
)
x = x.transpose(
perm=[0, 2,
1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
perm=[0, 2, 1]
) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
norm = 'ortho' if normalized else 'backward'
if return_complex:
assert not onesided, \
'onesided should be False when input(output of istft) or window is a complex Tensor.'
assert (
not onesided
), 'onesided should be False when input(output of istft) or window is a complex Tensor.'
out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
else:
assert not is_complex(window), \
'Data type of window should not be complex when return_complex is False.'
assert not is_complex(
window
), 'Data type of window should not be complex when return_complex is False.'
if onesided is False:
x = x[:, :, :n_fft // 2 + 1]
x = x[:, :, : n_fft // 2 + 1]
out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
out = paddle.multiply(out, window).transpose(
perm=[0, 2, 1]) # (batch, n_fft, num_frames)
out = overlap_add(x=out, hop_length=hop_length,
axis=-1) # (batch, seq_length)
perm=[0, 2, 1]
) # (batch, n_fft, num_frames)
out = overlap_add(
x=out, hop_length=hop_length, axis=-1
) # (batch, seq_length)
window_envelop = overlap_add(
x=paddle.tile(
x=paddle.multiply(window, window).unsqueeze(0),
repeat_times=[n_frames,
1]).transpose(perm=[1, 0]), # (n_fft, num_frames)
repeat_times=[n_frames, 1],
).transpose(
perm=[1, 0]
), # (n_fft, num_frames)
hop_length=hop_length,
axis=-1) # (seq_length, )
axis=-1,
) # (seq_length, )
if length is None:
if center:
out = out[:, (n_fft // 2):-(n_fft // 2)]
window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)]
out = out[:, (n_fft // 2) : -(n_fft // 2)]
window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)]
else:
if center:
start = n_fft // 2
else:
start = 0
out = out[:, start:start + length]
window_envelop = window_envelop[start:start + length]
out = out[:, start : start + length]
window_envelop = window_envelop[start : start + length]
# Check whether the Nonzero Overlap Add (NOLA) constraint is met.
if _non_static_mode() and window_envelop.abs().min().item() < 1e-11:
......
......@@ -20,6 +20,7 @@ __all__ = []
class ReLU(Layer):
"""
Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math::
......@@ -44,6 +45,7 @@ class ReLU(Layer):
relu = paddle.sparse.nn.ReLU()
out = relu(sparse_x)
# [0., 0., 1.]
"""
def __init__(self, name=None):
......@@ -59,7 +61,8 @@ class ReLU(Layer):
class Softmax(Layer):
"""
r"""
Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
Note:
......@@ -126,6 +129,7 @@ class Softmax(Layer):
class ReLU6(Layer):
"""
Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math::
......@@ -149,6 +153,7 @@ class ReLU6(Layer):
sparse_x = dense_x.to_sparse_coo(1)
relu6 = paddle.sparse.nn.ReLU6()
out = relu6(sparse_x)
"""
def __init__(self, name=None):
......@@ -164,7 +169,8 @@ class ReLU6(Layer):
class LeakyReLU(Layer):
"""
r"""
Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math::
......@@ -196,6 +202,7 @@ class LeakyReLU(Layer):
sparse_x = dense_x.to_sparse_coo(1)
leaky_relu = paddle.sparse.nn.LeakyReLU(0.5)
out = leaky_relu(sparse_x)
"""
def __init__(self, negative_slope=0.01, name=None):
......
......@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None):
def meshgrid(*args, **kwargs):
"""
Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids.
Takes a list of N tensors as input :attr:`*args`, each of which is a 1-dimensional vector, and creates N-dimensional grids.
Args:
*args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
......
......@@ -22,9 +22,17 @@ from .math import multiply
from .math import sum as paddle_sum
from ..fluid.framework import _in_legacy_dygraph
from paddle import _C_ops, _legacy_C_ops
from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
from ..fluid.data_feeder import (
check_variable_and_dtype,
check_type,
check_dtype,
)
from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
from ..fluid.framework import (
_non_static_mode,
in_dygraph_mode,
_in_legacy_dygraph,
)
import collections
import string
import opt_einsum
......@@ -52,12 +60,13 @@ def parse_op_labels(labelstr, operand):
'''
# Sanity checks
for c in labelstr.replace('.', ''):
assert c.isalpha(), (
f"Invalid equation: {c} is not a valid label, which should be letters."
)
assert (
c.isalpha()
), f"Invalid equation: {c} is not a valid label, which should be letters."
assert labelstr.replace('...', '', 1).find('.') == -1, (
f"Invalid equation: `.` is found outside of an ellipsis.")
assert (
labelstr.replace('...', '', 1).find('.') == -1
), f"Invalid equation: `.` is found outside of an ellipsis."
# Check shape. Note, in Paddle a tensor rank is always nonzero
ndims = len(operand.shape)
......@@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand):
full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3))
assert len(full_labelstr) == ndims, (
f"Invalid equation: the label string '{labelstr}' misses dimensions.")
assert (
len(full_labelstr) == ndims
), f"Invalid equation: the label string '{labelstr}' misses dimensions."
return full_labelstr
......@@ -90,7 +100,8 @@ def parse_labels(labelstr, operands):
nop_labels = labelstr.split(',')
assert len(nop_labels) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(nop_labels)} segments in the label equation.")
f"but found {len(nop_labels)} segments in the label equation."
)
return list(map(parse_op_labels, nop_labels, operands))
......@@ -101,8 +112,9 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
'''
# Sanity check.
if n_bcast_dims > 0:
assert '...' in rhs, (
f"Invalid equation: missing ellipsis in output labels.")
assert (
'...' in rhs
), f"Invalid equation: missing ellipsis in output labels."
rhs = rhs.replace('...', '')
rhs_set = set(rhs)
......@@ -114,10 +126,12 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
non_input_labels = rhs_set.difference(input_labels)
assert not non_input_labels, (
f"Invalid equation: "
f"output label {sorted(non_input_labels)} not used by any input.")
f"output label {sorted(non_input_labels)} not used by any input."
)
# Verify that output labels are not duplicate
assert len(rhs) == len(rhs_set), (
f"Invalid equation: duplicate output labels are found.")
assert len(rhs) == len(
rhs_set
), f"Invalid equation: duplicate output labels are found."
def build_view(in_labels, out_labels):
......@@ -159,8 +173,8 @@ def build_view(in_labels, out_labels):
# fill the broadcast dimension indices from right to left.
if s:
for ax, dim in zip(
range(start, end)[::-1],
range(s.start(), s.end())[::-1]):
range(start, end)[::-1], range(s.start(), s.end())[::-1]
):
inv_map[ax] = dim
# Now work on non-broadcast dimensions
......@@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims):
g_labels_out = rhs.replace('...', '.' * n_bcast_dims)
else:
g_labels_out = '.' * n_bcast_dims + ''.join(
l for l, c in zip(labels, count) if c == 1)
l for l, c in zip(labels, count) if c == 1
)
for i in range(len(count))[::-1]:
if labels[i] in g_labels_out:
......@@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes):
assert not non_bcastable, (
f"Invalid operands: label {g_labels[non_bcastable[0]]} "
f"corresponds to non-broadcastable dimensions.")
f"corresponds to non-broadcastable dimensions."
)
g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape]
g_masks = [[s > 1 or s == -1 for s in view_shape]
for view_shape in view_shapes]
g_masks = [
[s > 1 or s == -1 for s in view_shape] for view_shape in view_shapes
]
return g_shape, g_masks
......@@ -297,8 +314,9 @@ def diagonalize(labels, operand):
--------
'ijj...i' would be merged into 'ij...'
'''
assert not has_duplicated_labels(labels), (
f'Duplicate labels are not supported.')
assert not has_duplicated_labels(
labels
), f'Duplicate labels are not supported.'
return labels, operand
......@@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
plan.add_step(step)
# Check if conditions hold for turning the operation into a matmul
if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate(
(op1_vshape, op2_vshape)):
op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1])
] + [np.prod(op1_vshape[K])]
op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2])
] + [np.prod(op2_vshape[K])]
if (
j1 + j2 > 0
and k > 0
and -1 not in np.concatenate((op1_vshape, op2_vshape))
):
op1_shape = (
list(op1_vshape[I])
+ [np.prod(op1_vshape[J1])]
+ [np.prod(op1_vshape[K])]
)
op2_shape = (
list(op2_vshape[I])
+ [np.prod(op2_vshape[J2])]
+ [np.prod(op2_vshape[K])]
)
# Merge J dims and K dims by reshaping
step = reshape, [var1], var1, op1_shape
......@@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
step = squeeze, [var2], var2, [-1, -2]
plan.add_step(step)
elif j1 + j2 == 0 and not -1 in np.concatenate(
(op1_vshape[K], op2_vshape[K])):
(op1_vshape[K], op2_vshape[K])
):
assert all(op1_vshape[K] == op2_vshape[K])
step = reshape, [
var1
], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])]
step = (
reshape,
[var1],
var1,
list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])],
)
plan.add_step(step)
step = reshape, [
var2
], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])]
step = (
reshape,
[var2],
var2,
list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])],
)
plan.add_step(step)
step = matmul, [var1, var2], var2, False, True
plan.add_step(step)
......@@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
g_view[op2] = list(op2_view)
def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
n_bcast):
def plan_summation(
plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast
):
'''
Plan various kinds of summation
'''
......@@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
I, K, J1, J2 = list(range(n_bcast)), [], [], []
for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:],
op2_view[n_bcast:]):
for ax, dim1, dim2 in zip(
range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:]
):
if (dim1 != -1) != (dim2 != -1):
if dim1 != -1:
......@@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes):
class Plan:
def __init__(self):
self.env = {}
self.steps = []
......@@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast):
# op1 is a one element tensor.
plan_scalar_prod(plan, i - 1, i)
else:
plan_summation(plan, g_view, i - 1, i, g_supports, g_shape, g_count,
n_bcast)
plan_summation(
plan, g_view, i - 1, i, g_supports, g_shape, g_count, n_bcast
)
# for ax, dim in enumerate(g_view[nop-1][:nout]):
# assert dim == ax
......@@ -678,7 +714,9 @@ def preprocess(equation, *operands):
"""
equation = equation.replace(" ", "")
nop = len(operands)
assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop
assert nop > 0, (
"Required at least one operand in Einsum API, but received %s " % nop
)
# Part the equation to left hand side and right hand side
lhs, *rhs = equation.lower().split('->')
......@@ -692,22 +730,27 @@ def preprocess(equation, *operands):
assert len(lhs.split(',')) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(lhs.split(','))} segments in the label equation.")
f"but found {len(lhs.split(','))} segments in the label equation."
)
assert not ('...' in lhs and '...' not in rhs
assert not (
'...' in lhs and '...' not in rhs
), f'Invalid equation: missing ellipsis in output labels.'
assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) >
0), f'Duplicate labels are not supported.'
assert not (
len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0
), f'Duplicate labels are not supported.'
assert not has_duplicated_labels(
rhs), f'Invalid equation: duplicate output labels are found.'
rhs
), f'Invalid equation: duplicate output labels are found.'
return lhs, rhs, labels
def parse_fake_shape(equation, operands, labels):
"""
This shape is only used for operand planning and may differ from the original shape.
for example:
... is replaced by 1
......@@ -715,14 +758,15 @@ def parse_fake_shape(equation, operands, labels):
Results
-------
list of shape
"""
shaped = collections.namedtuple('shaped', ['shape'])
def fake_shape(label, op):
assert len(op.shape) == len(
label
), "length of shape and length of label must be the same, but received %d != %d" % (
len(op.shape), len(label))
assert len(op.shape) == len(label), (
"length of shape and length of label must be the same, but received %d != %d"
% (len(op.shape), len(label))
)
fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.']
fakes = list(map(abs, fakes)) # make -1 -> 1
if '.' in label:
......@@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels):
def rhs_inference(lhs):
def is_free(key):
return cnt.get(key) == 1 and key not in ['.', ',']
......@@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs):
def get_used_label(counter):
used = set(counter.elements())
for c in string.ascii_lowercase:
if c not in used: return c
if c not in used:
return c
raise ValueError(
"You have used all `a` - `z`, there can't find a unused for einsum optimization"
)
......@@ -786,14 +830,15 @@ def einsum_v2(equation, *operands):
var_list = list(operands)
for path in cons:
(a, b), _, eq, *__ = path
assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it."
assert (
a > b
), "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it."
var_s = [var_list.pop(a), var_list.pop(b)]
eq = eq.replace(broadcast_label, "...")
var_list.append(gen_einsum_op(eq, *var_s))
assert len(
var_list
) == 1, "There must be one elements in list, but received %d." % len(
var_list)
assert (
len(var_list) == 1
), "There must be one elements in list, but received %d." % len(var_list)
return var_list[0]
......@@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands):
if _in_legacy_dygraph():
# dygraph
return _legacy_C_ops.einsum(operands, len(operands), len(operands),
'equation', equation)[0]
return _legacy_C_ops.einsum(
operands, len(operands), len(operands), 'equation', equation
)[0]
for inp in operands:
check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
......@@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands):
helper.create_variable_for_type_inference(dtype=operands[0].dtype)
for i in range(len(operands))
]
helper.append_op(type='einsum',
helper.append_op(
type='einsum',
inputs={'Operands': operands},
outputs={
'Out': out,
"InnerCache": caches,
"XShape": xshape
},
attrs=attrs)
outputs={'Out': out, "InnerCache": caches, "XShape": xshape},
attrs=attrs,
)
return out
def einsum(equation, *operands):
r"""
einsum(equation, *operands)
The current version of this API should be used in dygraph only mode.
......@@ -873,8 +918,7 @@ def einsum(equation, *operands):
dimensions into broadcasting dimensions.
- Singular labels are called free labels, duplicate are dummy labels. Dummy labeled
dimensions will be reduced and removed in the output.
- Output labels can be explicitly specified on the right hand side of `->` or omitted.
In the latter case, the output labels will be inferred from the input labels.
- Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
- Inference of output labels
- Broadcasting label `...`, if present, is put on the leftmost position.
- Free labels are reordered alphabetically and put after `...`.
......@@ -884,10 +928,11 @@ def einsum(equation, *operands):
the sum over the original output.
- Non-input labels are invalid.
- Duplicate labels are invalid.
- For any dummmy label which is present for the output, it's promoted to
- For any dummy label which is present for the output, it's promoted to
a free label.
- For any free label which is not present for the output, it's lowered to
a dummy label.
- Examples
- '...ij, ...jk', where i and k are free labels, j is dummy. The output label
string is '...ik'
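A hedged sketch of the inference rule in the example above (shapes are illustrative):

.. code-block:: python

    import paddle

    a = paddle.rand([2, 3, 4])
    b = paddle.rand([2, 4, 5])
    out = paddle.einsum('...ij,...jk', a, b)   # output labels inferred as '...ik'
    print(out.shape)                           # [2, 3, 5]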
......@@ -920,7 +965,7 @@ def einsum(equation, *operands):
operands should equal the number of input terms in the equation.
Returns:
result (`Tensor`): the result tensor.
result (`Tensor`), the result tensor.
Examples:
.. code-block:: python
......@@ -992,8 +1037,10 @@ def einsum(equation, *operands):
# [[0.32043904, 0.18164253, 0.27810261],
# [0.50226176, 0.24512935, 0.39881429],
# [0.51476848, 0.23367381, 0.39229113]]])
"""
import os
if int(os.environ.get('FLAGS_new_einsum', "1")):
return einsum_v2(equation, *operands)
......@@ -1039,9 +1086,11 @@ def einsum(equation, *operands):
# Counting how many non-trivial dimensions remain for each ax
g_labels, g_view, g_nout, g_count = build_global_view(
nop_labels, rhs, n_bcast_dims)
g_shape, g_supports = build_global_shape(g_view, g_labels,
[op.shape for op in operands])
nop_labels, rhs, n_bcast_dims
)
g_shape, g_supports = build_global_shape(
g_view, g_labels, [op.shape for op in operands]
)
# Now we're ready to build up an execution plan
args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims
......
......@@ -1912,12 +1912,15 @@ def mv(x, vec, name=None):
def det(x, name=None):
"""
Calculates determinant value of a square matrix or batches of square matrices.
Args:
x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the
x (Tensor): the input matrix of size `(n, n)` or the
batch of matrices of size `(*, n, n)` where `*` is one or more
batch dimensions.
name(str, optional): Name of the output. Default is None. It's used
to print debug info for developers. Details: :ref:`api_guide_Name`
Returns:
Tensor, the determinant value of a square matrix or batches of square matrices.
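A minimal, hedged sketch (for a 2x2 matrix the value is ad - bc):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 2.],
                          [3., 4.]])
    print(paddle.linalg.det(x))    # -2.0, i.e. 1*4 - 2*3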
......@@ -1968,18 +1971,20 @@ def det(x, name=None):
def slogdet(x, name=None):
"""
Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches of square matrices' determinant.
The determinant can be computed with ``sign * exp(logabsdet)
The determinant can be computed with ``sign * exp(logabsdet)``.
Supports input of float, double
Note that for matrices that have zero determinant, this returns ``(0, -inf)``
Args:
x (Tensor): the batch of matrices of size :math:`(*, n, n)`
where :math:`*` is one or more batch dimensions.
Returns:
y (Tensor): A tensor containing the sign of the determinant and the natural logarithm
y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
of the absolute value of determinant, respectively.
Examples:
......@@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None):
def matrix_power(x, n, name=None):
r"""
Computes the n-th power of a square matrix or a batch of square matrices.
Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be
......@@ -2122,7 +2128,7 @@ def matrix_power(x, n, name=None):
For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its
- Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its
data type should be the same as that of `x`.
Examples:
......@@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
def solve(x, y, name=None):
r"""
Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'.
Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be
Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be
a vector/matrix or a batch of vectors/matrices, the equation should be:
.. math::
......@@ -3068,9 +3075,9 @@ def solve(x, y, name=None):
Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
Args:
x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or
x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64.
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64.
name(str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
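A minimal, hedged sketch solving a 2x2 system and checking the residual (values are illustrative):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[3., 1.],
                          [1., 2.]])
    y = paddle.to_tensor([[9.], [8.]])
    out = paddle.linalg.solve(x, y)      # expected [[2.], [3.]]
    print(paddle.matmul(x, out) - y)     # residual close to zero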
......
......@@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
"""
r"""
stanh activation.
.. math::
......@@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
x (Tensor): The input Tensor with data type float32, float64.
scale_a (float, optional): The scale factor a of the input. Default is 0.67.
scale_b (float, optional): The scale factor b of the output. Default is 1.7159.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
A Tensor with the same data type and shape as ``x`` .
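A minimal, hedged sketch (assuming the usual definition ``scale_b * tanh(scale_a * x)``; values are illustrative):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
    out = paddle.stanh(x, scale_a=0.67, scale_b=1.7159)
    manual = 1.7159 * paddle.tanh(0.67 * x)
    # `out` and `manual` should match under the assumed definition.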
......
......@@ -1301,6 +1301,7 @@ def distribute_fpn_proposals(
name=None,
):
r"""
In Feature Pyramid Networks (FPN) models, it is needed to distribute
all proposals into different FPN level, with respect to scale of the proposals,
the referring scale and the referring level. Besides, to restore the order of
......@@ -1308,8 +1309,9 @@ def distribute_fpn_proposals(
in current proposals. To compute FPN level for each roi, the formula is given as follows:
.. math::
roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
where BBoxArea is a function to compute the area of each roi.
Args:
......@@ -1333,11 +1335,11 @@ def distribute_fpn_proposals(
None by default.
Returns:
multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
- multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is the number of proposals in that level
and the data type is the same as `fpn_rois` . The length is max_level-min_level+1.
restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
- restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
, where N is the number of total rois. The data type is int32.
rois_num_per_level (List): A list of 1-D Tensor and each Tensor is
- rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
the RoIs' number in each image on the corresponding level. The shape
is [B] and data type of int32, where B is the number of images.
......@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals(
refer_level=4,
refer_scale=224,
rois_num=rois_num)
"""
num_lvl = max_level - min_level + 1
......@@ -2441,6 +2444,7 @@ def matrix_nms(
name=None,
):
"""
This operator performs matrix non-maximum suppression (NMS).
First selects a subset of candidate bounding boxes that have higher scores
than score_threshold (if provided), then the top k candidates are selected if
......@@ -2448,6 +2452,7 @@ def matrix_nms(
decayed according to the Matrix NMS scheme.
After the NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1.
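To make the decay step concrete, the sketch below reproduces the score-decay idea on a plain IoU matrix. The linear and Gaussian forms follow the Matrix NMS paper; the helper name, the sigma convention, and the sorted-by-score assumption are illustrative rather than a copy of this operator's kernel:

.. code-block:: python

    import numpy as np

    # `ious` is the pairwise IoU matrix of boxes already sorted by descending score.
    def matrix_nms_decay(ious, use_gaussian=False, gaussian_sigma=2.0):
        n = ious.shape[0]
        iou = np.triu(ious, k=1)       # IoU of box j with every higher-scored box i (i < j)
        iou_max = iou.max(axis=0)      # for each box: max IoU with any box scored above it
        if use_gaussian:
            decay = np.exp((iou_max[:, None] ** 2 - iou ** 2) / gaussian_sigma)
        else:
            decay = (1.0 - iou) / (1.0 - iou_max[:, None])
        valid = np.triu(np.ones((n, n), dtype=bool), k=1)
        # each box keeps the most pessimistic decay over all higher-scored boxes
        return np.where(valid, decay, 1.0).min(axis=0)

    # decayed_scores = sorted_scores * matrix_nms_decay(ious)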
Args:
bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding boxes,
......@@ -2471,29 +2476,32 @@ def matrix_nms(
on score_threshold.
keep_top_k (int): Number of total bboxes to be kept per image after NMS
step. -1 means keeping all bboxes after NMS step.
use_gaussian (bool): Use Gaussian as the decay function. Default: False
gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
background_label (int): The index of background label, the background
use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
background_label (int, optional): The index of background label, the background
label will be ignored. If set to -1, then all
categories will be considered. Default: 0
normalized (bool): Whether detections are normalized. Default: True
return_index(bool): Whether return selected index. Default: False
return_rois_num(bool): whether return rois_num. Default: True
name(str): Name of the matrix nms op. Default: None.
normalized (bool, optional): Whether detections are normalized. Default: True
return_index(bool, optional): Whether to return the selected index. Default: False
return_rois_num(bool, optional): Whether to return rois_num. Default: True
name(str, optional): Name of the matrix nms op. Default: None.
Returns:
A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,
- A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
- Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
detection results.
Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
- Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
selected indices, which are absolute values across batches.
rois_num (Tensor): A 1-D Tensor with shape [N] containing
- rois_num (Tensor), A 1-D Tensor with shape [N] containing
the number of detected boxes in each image.
Examples:
.. code-block:: python
import paddle
from paddle.vision.ops import matrix_nms
boxes = paddle.rand([4, 1, 4])
boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
......@@ -2501,6 +2509,7 @@ def matrix_nms(
out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
score_threshold=0.5, post_threshold=0.1,
nms_top_k=400, keep_top_k=200, normalized=False)
"""
check_variable_and_dtype(
bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'
......