Unverified commit 1490aaa9, authored by ustiniankw, committed by GitHub

[cherry-pick2.4]en-docs warning&error fix (#48332)

* fixdocs, test=document_fix

* fixdocs, test=document_fix
Parent 3fa7a736
@@ -26,7 +26,6 @@ non_auto_func_called = True
def __non_auto_func_called__(func):
    def __impl__(*args, **kwargs):
        global non_auto_func_called
        non_auto_func_called = False
@@ -112,6 +111,7 @@ class DistributedStrategy(object):
    def __init__(self):
        """
        DistributedStrategy is the main configuration entry for distributed training of Paddle.
        All of the distributed training configurations can be configured in DistributedStrategy,
        such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
@@ -129,7 +129,8 @@ class DistributedStrategy(object):
        key = 'FLAGS_cudnn_batchnorm_spatial_persistent'
        if _global_flags().is_public(key):
            self.strategy.cudnn_batchnorm_spatial_persistent = bool(
                _global_flags()[key]
            )
        key = 'FLAGS_conv_workspace_size_limit'
        if _global_flags().is_public(key):
            self.strategy.conv_workspace_size_limit = int(_global_flags()[key])
@@ -144,16 +145,17 @@ class DistributedStrategy(object):
    def __setattr__(self, key, value):
        if self.__lock_attr and not hasattr(self, key):
            raise TypeError(
                "%s is not a attribute of %s" % (key, self.__class__.__name__)
            )
        object.__setattr__(self, key, value)

    def save_to_prototxt(self, output):
        """
        Serialize current DistributedStrategy to string and save to output file

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -162,25 +164,28 @@ class DistributedStrategy(object):
                strategy.recompute = True
                strategy.recompute_configs = {"checkpoints": ["x"]}
                strategy.save_to_prototxt("dist_strategy.prototxt")

        """
        with open(output, "w") as fout:
            fout.write(str(self.strategy))
    def load_from_prototxt(self, pb_file):
        """
        Load from prototxt file for DistributedStrategy initialization

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.load_from_prototxt("dist_strategy.prototxt")

        """
        with open(pb_file, 'r') as f:
            self.strategy = google.protobuf.text_format.Merge(
                str(f.read()), self.strategy
            )
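    # Illustrative usage sketch (not part of the original diff): the two methods
    # above round-trip a strategy through a prototxt file; the file name is
    # arbitrary.
    #
    #     import paddle.distributed.fleet as fleet
    #
    #     strategy = fleet.DistributedStrategy()
    #     strategy.recompute = True
    #     strategy.save_to_prototxt("dist_strategy.prototxt")
    #
    #     restored = fleet.DistributedStrategy()
    #     restored.load_from_prototxt("dist_strategy.prototxt")
    #     assert restored.recompute is True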
    @property
    def execution_strategy(self):
@@ -188,7 +193,6 @@ class DistributedStrategy(object):
        Configure ExecutionStrategy for DistributedStrategy

        Examples:
            .. code-block:: python

                import paddle
@@ -199,12 +203,16 @@ class DistributedStrategy(object):
                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.execution_strategy = exe_strategy

        """
        execution_strategy = paddle.fluid.ExecutionStrategy()
        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
        for f in fields:
            setattr(
                execution_strategy,
                f.name,
                getattr(self.strategy.execution_strategy, f.name),
            )
        return execution_strategy

    @execution_strategy.setter
@@ -212,18 +220,21 @@ class DistributedStrategy(object):
    def execution_strategy(self, strategy):
        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
        for f in fields:
            setattr(
                self.strategy.execution_strategy,
                f.name,
                getattr(strategy, f.name),
            )
    @property
    def build_strategy(self):
        """
        Configure BuildStrategy for DistributedStrategy
        Note that the properties of BuildStrategy are valid in DistributedStrategy
        only if the property is non-distributed strategy.

        Examples:
            .. code-block:: python

                import paddle
@@ -239,6 +250,7 @@ class DistributedStrategy(object):
                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.build_strategy = build_strategy

        """
        build_strategy = paddle.fluid.BuildStrategy()
@@ -261,41 +273,48 @@ class DistributedStrategy(object):
                    value = ReduceStrategyFleet(value)
                setattr(self.strategy.build_strategy, f.name, value)
            elif f.label == 3:  # repeated field
                getattr(self.strategy.build_strategy, f.name).extend(
                    getattr(strategy, f.name)
                )
    @property
    def gradient_scale_configs(self):
        """
        Set the strategy of gradient scale

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.gradient_scale_configs = {'scale_strategy': 'avg'}

        Note that, strategy must be in 'avg', 'sum' or 'customized'

        """
        return get_msg_dict(self.strategy.gradient_scale_configs)

    @gradient_scale_configs.setter
    @is_strict_auto
    def gradient_scale_configs(self, config):
        check_configs_key(
            self.strategy.gradient_scale_configs,
            config,
            'gradient_scale_configs',
        )
        assign_configs_value(self.strategy.gradient_scale_configs, config)
    @property
    def a_sync(self):
        """
        Indicating whether we are using asynchronous stochastic gradient descent updates
        for training. This property is valid when we are using parameter server training,
        which is implied by setting an appropriate RoleMaker
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -307,6 +326,7 @@ class DistributedStrategy(object):
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.a_sync
@@ -318,12 +338,15 @@ class DistributedStrategy(object):
            self.a_sync_configs = {"k_steps": 0}
        else:
            raise ValueError(
                "The type of `flag` is invalid, expected type is bool, but received {}".format(
                    type(flag)
                )
            )
    @property
    def a_sync_configs(self):
        """
        Set a_sync update configurations. In general, asynchronous parameter server
        training has several configurable settings that can be configured through
        a dict.
@@ -344,7 +367,6 @@ class DistributedStrategy(object):
            runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -365,13 +387,15 @@ class DistributedStrategy(object):
    @a_sync_configs.setter
    @is_strict_auto
    def a_sync_configs(self, configs):
        check_configs_key(
            self.strategy.a_sync_configs, configs, "a_sync_configs"
        )
        assign_configs_value(self.strategy.a_sync_configs, configs)
    @property
    def trainer_desc_configs(self):
        """
        Set trainer desc configurations.

        **Notes**:
@@ -384,7 +408,6 @@ class DistributedStrategy(object):
            stat_var_names(list(str)):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -404,11 +427,11 @@ class DistributedStrategy(object):
    @property
    def adam_d2sum(self):
        """
        set adam_d2sum
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -420,6 +443,7 @@ class DistributedStrategy(object):
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.adam_d2sum
@@ -430,27 +454,37 @@ class DistributedStrategy(object):
            self.strategy.adam_d2sum = flag
        else:
            raise ValueError(
                "The type of `flag` is invalid, expected type is bool, but received {}".format(
                    type(flag)
                )
            )

    @trainer_desc_configs.setter
    @is_strict_auto
    def trainer_desc_configs(self, configs):
        check_configs_key(
            self.strategy.trainer_desc_configs, configs, "trainer_desc_configs"
        )
        assign_configs_value(self.strategy.trainer_desc_configs, configs)
    @property
    def fs_client_param(self):
        """
        Set fs client configurations.

        Note:
            uri(str): the uri of fs client
            user(str): the user_name of fs client
            passwd(str): the passwd of fs client
            hadoop_bin(str):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)
@@ -459,14 +493,16 @@ class DistributedStrategy(object):
                strategy.fs_client_param = configs
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.fs_client_param

    @fs_client_param.setter
    @is_strict_auto
    def fs_client_param(self, configs):
        check_configs_key(
            self.strategy.fs_client_param, configs, "fs_client_param"
        )
        assign_configs_value(self.strategy.fs_client_param, configs)
    @property
@@ -477,6 +513,7 @@ class DistributedStrategy(object):
    @is_strict_auto
    def sparse_table_configs(self, configs):
        from google.protobuf.descriptor import FieldDescriptor

        table_param = self.strategy.downpour_table_param

        def set_table_config(msg, config_name, configs, index=0):
@@ -493,8 +530,9 @@ class DistributedStrategy(object):
                            data = getattr(msg, field.name).add()
                            set_table_config(data, name, configs, i)
                    else:
                        set_table_config(
                            getattr(msg, field.name), name, configs
                        )
                else:
                    # print("not message:", name)
                    if name not in configs:
@@ -513,133 +551,206 @@ class DistributedStrategy(object):
            for table_name in configs:
                table_data = table_param.add()
                table_data.table_name = table_name
                set_table_config(
                    table_data,
                    "table_parameters." + table_name,
                    configs[table_name],
                )
    @sparse_table_configs.setter
    def fleet_desc_configs(self, configs):
        support_sparse_key_list = [
            'sparse_table_class',
            'sparse_compress_in_save',
            'sparse_shard_num',
            'sparse_accessor_class',
            'sparse_learning_rate',
            'sparse_initial_g2sum',
            'sparse_initial_range',
            'sparse_weight_bounds',
            'sparse_fea_dim',
            'sparse_embedx_dim',
            'sparse_embedx_threshold',
            'sparse_nonclk_coeff',
            'sparse_click_coeff',
            'sparse_base_threshold',
            'sparse_delta_threshold',
            'sparse_delta_keep_days',
            'sparse_delete_after_unseen_days',
            'sparse_show_click_decay_rate',
            'sparse_delete_threshold',
            'sparse_converter',
            'sparse_deconverter',
            'sparse_enable_cache',
            'sparse_cache_rate',
            'sparse_cache_file_num',
            'sparse_beta1_decay_rate',
            'sparse_beta2_decay_rate',
            'sparse_ada_epsilon',
            'sparse_optimizer',
            'sparse_ssd_unseenday_threshold',
            'embed_sparse_optimizer',
            'embed_sparse_learning_rate',
            'embed_sparse_weight_bounds',
            'embed_sparse_initial_range',
            'embed_sparse_initial_g2sum',
            'embed_sparse_beta1_decay_rate',
            'embed_sparse_beta2_decay_rate',
            'embedx_sparse_optimizer',
            'embedx_sparse_learning_rate',
            'embedx_sparse_weight_bounds',
            'embedx_sparse_initial_range',
            'embedx_sparse_initial_g2sum',
            'embedx_sparse_beta1_decay_rate',
            'embedx_sparse_beta2_decay_rate',
            'feature_learning_rate',
            'nodeid_slot',
        ]
        support_sparse_table_class = ['DownpourSparseTable']
        support_sparse_accessor_class = [
            'DownpourSparseValueAccessor',
            'DownpourCtrAccessor',
            'DownpourCtrDoubleAccessor',
            'DownpourUnitAccessor',
            'DownpourDoubleUnitAccessor',
            'DownpourCtrDymfAccessor',
        ]
        from google.protobuf.descriptor import FieldDescriptor

        table_param = self.strategy.downpour_table_param
        def add_graph_config(graph, strategy):
            graph.feature_learning_rate = strategy.get(
                'feature_learning_rate', 0.05
            )
            graph.nodeid_slot = strategy.get('nodeid_slot', 9008)
        def sparse_optimizer_config(sgd, strategy, prefix):
            optimizer_name = strategy.get(
                prefix + "sparse_optimizer", "adagrad"
            )
            sgd.name = optimizer_name
            if optimizer_name == "naive":
                sgd.name = "SparseNaiveSGDRule"
                sgd.naive.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.naive.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.naive.weight_bounds.extend(bounds)
            elif optimizer_name == "adagrad":
                sgd.name = 'SparseAdaGradSGDRule'
                sgd.adagrad.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.adagrad.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                if prefix == "embed_":
                    sgd.adagrad.initial_range = 0
                sgd.adagrad.initial_g2sum = strategy.get(
                    prefix + 'sparse_initial_g2sum', 3
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adagrad.weight_bounds.extend(bounds)
            elif optimizer_name == "std_adagrad":
                sgd.name = 'StdAdaGradSGDRule'
                sgd.adagrad.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.adagrad.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                if prefix == "embed_":
                    sgd.adagrad.initial_range = 0
                sgd.adagrad.initial_g2sum = strategy.get(
                    prefix + 'sparse_initial_g2sum', 3
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adagrad.weight_bounds.extend(bounds)
            elif optimizer_name == "adam":
                sgd.name = 'SparseAdamSGDRule'
                sgd.adam.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.001
                )
                sgd.adam.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                sgd.adam.beta1_decay_rate = strategy.get(
                    prefix + 'sparse_beta1_decay_rate', 0.9
                )
                sgd.adam.beta2_decay_rate = strategy.get(
                    prefix + 'sparse_beta2_decay_rate', 0.999
                )
                sgd.adam.ada_epsilon = strategy.get(
                    prefix + 'sparse_ada_epsilon', 1e-8
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adam.weight_bounds.extend(bounds)
            elif optimizer_name == "shared_adam":
                sgd.name = 'SparseSharedAdamSGDRule'
                sgd.adam.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.001
                )
                sgd.adam.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                sgd.adam.beta1_decay_rate = strategy.get(
                    prefix + 'sparse_beta1_decay_rate', 0.9
                )
                sgd.adam.beta2_decay_rate = strategy.get(
                    prefix + 'sparse_beta2_decay_rate', 0.999
                )
                sgd.adam.ada_epsilon = strategy.get(
                    prefix + 'sparse_ada_epsilon', 1e-8
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adam.weight_bounds.extend(bounds)
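        # Illustrative note (not part of the original diff): the helper above
        # maps a strategy dict onto one SGD rule; keys are prefixed for the
        # embed/embedx sub-tables. For example, assuming a user-supplied dict
        #
        #     {'embed_sparse_optimizer': 'adam',
        #      'embed_sparse_learning_rate': 0.001,
        #      'embed_sparse_beta1_decay_rate': 0.9}
        #
        # calling sparse_optimizer_config(sgd, config, 'embed_') would select
        # 'SparseAdamSGDRule' and fill learning_rate and beta1_decay_rate from
        # the dict, falling back to the defaults shown above for missing keys.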
        def set_sparse_table_config(table_data, config):
            for key in config:
                if key not in support_sparse_key_list:
                    raise ValueError("strategy key '%s' not support" % (key))
            table_class = config.get(
                "sparse_table_class", "DownpourSparseTable"
            )
            if table_class not in support_sparse_table_class:
                raise ValueError(
                    "support sparse_table_class: ['DownpourSparseTable'], but actual %s"
                    % (table_class)
                )
            table_data.table_class = 'MemorySparseTable'
            table_data.shard_num = config.get('sparse_shard_num', 1000)
            table_data.enable_sparse_table_cache = config.get(
                'sparse_enable_cache', True
            )
            table_data.sparse_table_cache_rate = config.get(
                'sparse_cache_rate', 0.00055
            )
            table_data.sparse_table_cache_file_num = config.get(
                'sparse_cache_file_num', 16
            )
            accessor_class = config.get(
                "sparse_accessor_class", "DownpourCtrAccessor"
            )
            if accessor_class not in support_sparse_accessor_class:
                raise ValueError(
                    "support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s"
                    % (accessor_class)
                )
            if accessor_class.find("Double") >= 0:
                table_data.accessor.accessor_class = 'CtrDoubleAccessor'
@@ -654,7 +765,8 @@ class DistributedStrategy(object):
            table_data.accessor.embedx_dim = config.get('sparse_embedx_dim', 8)
            table_data.accessor.fea_dim = table_data.accessor.embedx_dim + 3
            table_data.accessor.embedx_threshold = config.get(
                'sparse_embedx_threshold', 10
            )
            if accessor_class == 'DownpourUnitAccessor':
                table_data.accessor.ctr_accessor_param.show_scale = False
@@ -662,23 +774,32 @@ class DistributedStrategy(object):
                table_data.accessor.ctr_accessor_param.show_scale = True
            table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get(
                'sparse_nonclk_coeff', 0.1
            )
            table_data.accessor.ctr_accessor_param.click_coeff = config.get(
                'sparse_click_coeff', 1
            )
            table_data.accessor.ctr_accessor_param.base_threshold = config.get(
                'sparse_base_threshold', 1.5
            )
            table_data.accessor.ctr_accessor_param.delta_threshold = config.get(
                'sparse_delta_threshold', 0.25
            )
            table_data.accessor.ctr_accessor_param.delta_keep_days = config.get(
                'sparse_delta_keep_days', 16
            )
            table_data.accessor.ctr_accessor_param.show_click_decay_rate = (
                config.get('sparse_show_click_decay_rate', 0.98)
            )
            table_data.accessor.ctr_accessor_param.delete_threshold = (
                config.get('sparse_delete_threshold', 0.8)
            )
            table_data.accessor.ctr_accessor_param.delete_after_unseen_days = (
                config.get('sparse_delete_after_unseen_days', 30)
            )
            table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = (
                config.get('sparse_ssd_unseenday_threshold', 1)
            )
            converter = config.get('sparse_converter', "")
            deconverter = config.get('sparse_deconverter', "")
@@ -692,23 +813,33 @@ class DistributedStrategy(object):
            save_data2.converter = converter
            save_data2.deconverter = deconverter

            if (
                accessor_class == 'DownpourCtrAccessor'
                or accessor_class == 'DownpourCtrDoubleAccessor'
            ):
                sparse_optimizer_config(
                    table_data.accessor.embed_sgd_param, config, ''
                )
                sparse_optimizer_config(
                    table_data.accessor.embedx_sgd_param, config, ''
                )
            else:
                sparse_optimizer_config(
                    table_data.accessor.embed_sgd_param, config, 'embed_'
                )
                sparse_optimizer_config(
                    table_data.accessor.embedx_sgd_param, config, 'embedx_'
                )
            add_graph_config(table_data.accessor.graph_sgd_param, config)

        if not configs:
            print("fleet desc config is empty")
        else:
            for table_name in configs:
                if (
                    table_name == 'dense_table'
                    or table_name == 'datanorm_table'
                ):
                    continue
                if type(configs[table_name]) != dict:
                    continue
@@ -744,6 +875,7 @@ class DistributedStrategy(object):
    @property
    def amp_configs(self):
        """
        Set automatic mixed precision training configurations. In general, amp has several configurable
        settings that can be configured through a dict.
@@ -772,7 +904,6 @@ class DistributedStrategy(object):
            Default True. Only takes effect when `use_pure_fp16` is turned on.

        Examples 1:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -783,7 +914,6 @@ class DistributedStrategy(object):
                    "custom_white_list": ['conv2d']}

        Examples 2:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -794,6 +924,7 @@ class DistributedStrategy(object):
                    "init_loss_scaling": 32768,
                    "use_pure_fp16": True
                }

        """
        return get_msg_dict(self.strategy.amp_configs)
@@ -806,11 +937,11 @@ class DistributedStrategy(object):
    @property
    def asp(self):
        """
        Indicating whether we are using automatic sparsity training
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -835,7 +966,6 @@ class DistributedStrategy(object):
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -843,22 +973,24 @@ class DistributedStrategy(object):
                strategy.recompute = True
                # suppose x and y are names of checkpoint tensors for recomputation
                strategy.recompute_configs = {"checkpoints": ["x", "y"]}

        """
        return self.strategy.recompute

    @property
    def sync_nccl_allreduce(self):
        """
        Indicating whether we are using synchronized all reduce in each communication thread
        We note that system overhead is usually lower when sync_nccl_allreduce = True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.sync_nccl_allreduce = True

        """
        return self.strategy.sync_nccl_allreduce
@@ -873,17 +1005,18 @@ class DistributedStrategy(object):
    @property
    def use_hierarchical_allreduce(self):
        """
        Indicating whether we are using hierarchical allreduce in collective communication
        Hierarchical allreduce often does allreduce within a certain node group and then do
        allreduce among the leaders of each group

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.use_hierarchical_allreduce = True

        """
        return self.strategy.use_hierarchical_allreduce
@@ -900,16 +1033,17 @@ class DistributedStrategy(object):
    @property
    def hierarchical_allreduce_inter_nranks(self):
        """
        Number of ranks for low level node groups in hierarchical allreduce
        Default value: number of GPU cards on each single GPU machine

        Example:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.hierarchical_allreduce_inter_nranks = 8

        """
        return self.strategy.hierarchical_allreduce_inter_nranks
@@ -926,17 +1060,18 @@ class DistributedStrategy(object):
    @property
    def sync_batch_norm(self):
        """
        Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.sync_batch_norm = True

        """
        return self.strategy.sync_batch_norm
@@ -952,16 +1087,17 @@ class DistributedStrategy(object):
    @property
    def fuse_all_reduce_ops(self):
        """
        Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_all_reduce_ops = False

        """
        return self.strategy.fuse_all_reduce_ops
@@ -976,17 +1112,18 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_size_in_MB(self):
        """
        Specifying the size of gradient to fuse in Mega-Bytes
        Default value: 32

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_MB = 50

        """
        return self.strategy.fuse_grad_size_in_MB
@@ -1001,6 +1138,7 @@ class DistributedStrategy(object):
    @property
    def last_comm_group_size_MB(self):
        """
        Specifying the size of gradient to fuse in Mega-Bytes when
        the last group of each batch communicates. Making the last group
        small is useful to improve performance.
@@ -1013,6 +1151,7 @@ class DistributedStrategy(object):
                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.last_comm_group_size_MB = 2

        """
        return self.strategy.last_comm_group_size_MB
@@ -1027,18 +1166,19 @@ class DistributedStrategy(object):
    @property
    def find_unused_parameters(self):
        """
        Indicating whether we are using find_unused_parameters to
        find unused parameters in DataParallel.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.find_unused_parameters = True

        """
        return self.strategy.find_unused_parameters
@@ -1070,17 +1210,18 @@ class DistributedStrategy(object):
    @property
    def nccl_comm_num(self):
        """
        Specifying the number of NCCL communicator
        Default value: 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.nccl_comm_num = 2

        """
        return self.strategy.nccl_comm_num
@@ -1104,6 +1245,7 @@ class DistributedStrategy(object):
    @property
    def recompute_configs(self):
        """
        Set recompute configurations.

        **Note**:
@@ -1120,7 +1262,6 @@ class DistributedStrategy(object):
            specific here should be determined ("-1" is not allowed).

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1137,13 +1278,15 @@ class DistributedStrategy(object):
    @recompute_configs.setter
    @is_strict_auto
    def recompute_configs(self, configs):
        check_configs_key(
            self.strategy.recompute_configs, configs, "checkpoint_configs"
        )
        assign_configs_value(self.strategy.recompute_configs, configs)
    @property
    def sharding(self):
        """
        Indicating whether we are using sharding Optimizer for memory
        optimization. We implement the sharding optimizer following the ZeRO-DP
        idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
@@ -1154,12 +1297,12 @@ class DistributedStrategy(object):
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sharding = True

        """
        return self.strategy.sharding
@@ -1174,6 +1317,7 @@ class DistributedStrategy(object):
    @property
    def sharding_configs(self):
        """
        Set sharding configurations.

        **Note**:
@@ -1211,7 +1355,6 @@ class DistributedStrategy(object):
        Examples:
            .. code-block:: python

                # sharding-DP, 2 nodes with 8 gpus per node
@@ -1225,23 +1368,25 @@ class DistributedStrategy(object):
                    "dp_degree": 2,
                    "gradient_merge_acc_step": 4,
                }

        """
        return get_msg_dict(self.strategy.sharding_configs)

    @sharding_configs.setter
    @is_strict_auto
    def sharding_configs(self, configs):
        check_configs_key(
            self.strategy.sharding_configs, configs, "sharding_configs"
        )
        assign_configs_value(self.strategy.sharding_configs, configs)
    @property
    def without_graph_optimization(self):
        """
        Run program using Executor other than ParallelExecutor.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1264,14 +1409,18 @@ class DistributedStrategy(object):
    @property
    def _calc_comm_same_stream(self):
        """
        This is based on the raw_program_optimizer program.
        Set whether to use the same stream for calculation and communication when fusing allreduce.
        The default value for the calc_comm_same_stream is False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.calc_comm_same_stream = True

        """
        return self.strategy.calc_comm_same_stream
@@ -1288,14 +1437,18 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_merge(self):
        """
        Set whether to fuse the grad for gradient merge.
        Note: this flag only affects the gradient merge under pipeline mode
        The default value for the fuse_grad_merge is False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_param_grad = True

        """
        return self.strategy.fuse_grad_merge
@@ -1310,12 +1463,17 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_size_in_num(self):
        """
        This is based on the raw_program_optimizer program and sets the number of gradients fused per allreduce op.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_num = 2

        """
        return self.strategy.fuse_grad_size_in_num
@@ -1332,13 +1490,13 @@ class DistributedStrategy(object):
    @property
    def pipeline(self):
        """
        Indicating whether we are using pipeline parallelism for distributed training.
        The current implementation mainly focuses on pipeline parallelism within a single GPU machine and
        data parallelism across GPU machines. The pipeline information is indicated through
        device_guard information in user-defined program.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1383,6 +1541,7 @@ class DistributedStrategy(object):
    @property
    def pipeline_configs(self):
        """
        Set pipeline parallelism configurations. In pipeline parallelism,
        different parts of neural networks are running on different GPUs.
        There are Tensor queue buffers between each pair of neighboring GPUs
@@ -1398,7 +1557,6 @@ class DistributedStrategy(object):
        **micro_batch_size**: the number of small batches in each user defined batch

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1413,17 +1571,18 @@ class DistributedStrategy(object):
    @pipeline_configs.setter
    @is_strict_auto
    def pipeline_configs(self, configs):
        check_configs_key(
            self.strategy.pipeline_configs, configs, "pipeline_configs"
        )
        assign_configs_value(self.strategy.pipeline_configs, configs)
    @property
    def tensor_parallel(self):
        """
        Indicating whether we are using tensor parallel for distributed training.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1444,16 +1603,18 @@ class DistributedStrategy(object):
    @property
    def tensor_parallel_configs(self):
        """
        Set tensor_parallel configurations.

        **Notes**:
            **Detailed arguments for tensor_parallel_configs**

            **tensor_parallel_degree**: degree of tensor parallel

            **tensor_init_seed**: parameter initialization random seed

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1468,54 +1629,62 @@ class DistributedStrategy(object):
    @tensor_parallel_configs.setter
    @is_strict_auto
    def tensor_parallel_configs(self, configs):
        check_configs_key(
            self.strategy.tensor_parallel_configs,
            configs,
            "tensor_parallel_configs",
        )
        assign_configs_value(self.strategy.tensor_parallel_configs, configs)
    @property
    def hybrid_configs(self):
        """
        Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
        needs to meet the following relationships

        total_number_GPUs = dp_degree * mp_degree * pp_degree

        **Note**:
            **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1.
                                This value should be an integer greater than 0.
                                If it is not set, or set to -1, its value will be inferred
                                based on the total number of cards.

            **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1

            **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.hybrid_configs = {
                    "dp_degree": 1,
                    "mp_degree": 2,
                    "pp_degree": 1}

        """
        return get_msg_dict(self.strategy.hybrid_configs)

    @hybrid_configs.setter
    def hybrid_configs(self, configs):
        check_configs_key(
            self.strategy.hybrid_configs, configs, "hybrid_configs"
        )
        assign_configs_value(self.strategy.hybrid_configs, configs)
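    # Illustrative example (not part of the original diff): the three degrees
    # must multiply to the total GPU count. On 8 GPUs, for instance,
    #
    #     strategy.hybrid_configs = {"dp_degree": 2, "mp_degree": 2, "pp_degree": 2}
    #
    # satisfies 2 * 2 * 2 == 8; setting dp_degree to -1 instead lets it be
    # inferred from the total number of cards, as described in the docstring.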
@property @property
def localsgd(self): def localsgd(self):
""" """
Indicating whether we are using Local SGD training. Default Value: False Indicating whether we are using Local SGD training. Default Value: False
For more details, please refer to For more details, please refer to
`Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_. `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1536,6 +1705,7 @@ class DistributedStrategy(object): ...@@ -1536,6 +1705,7 @@ class DistributedStrategy(object):
@property @property
def localsgd_configs(self): def localsgd_configs(self):
""" """
Set LocalSGD training configurations. LocalSGD has a configurable Set LocalSGD training configurations. LocalSGD has a configurable
setting that can be configured through a dict. setting that can be configured through a dict.
...@@ -1544,7 +1714,6 @@ class DistributedStrategy(object): ...@@ -1544,7 +1714,6 @@ class DistributedStrategy(object):
begin_step(int): The step at which to begin LocalSGD training. Default 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1552,6 +1721,7 @@ class DistributedStrategy(object): ...@@ -1552,6 +1721,7 @@ class DistributedStrategy(object):
strategy.localsgd = True strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 4, strategy.localsgd_configs = {"k_steps": 4,
"begin_step": 30} "begin_step": 30}
""" """
return get_msg_dict(self.strategy.localsgd_configs) return get_msg_dict(self.strategy.localsgd_configs)
...@@ -1559,20 +1729,20 @@ class DistributedStrategy(object): ...@@ -1559,20 +1729,20 @@ class DistributedStrategy(object):
@localsgd_configs.setter @localsgd_configs.setter
@is_strict_auto @is_strict_auto
def localsgd_configs(self, configs): def localsgd_configs(self, configs):
check_configs_key(self.strategy.localsgd_configs, configs, check_configs_key(
"localsgd_configs") self.strategy.localsgd_configs, configs, "localsgd_configs"
)
assign_configs_value(self.strategy.localsgd_configs, configs) assign_configs_value(self.strategy.localsgd_configs, configs)
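# A rough sketch (one plausible reading, not the exact Paddle kernel) of the
# schedule that k_steps and begin_step control: from begin_step onwards each
# worker trains locally and parameters are synchronized every k_steps steps.
def _local_sgd_sync_points(num_steps, k_steps=4, begin_step=30):
    points = []
    for step in range(1, num_steps + 1):
        if step >= begin_step and (step - begin_step) % k_steps == 0:
            points.append(step)  # parameters would be averaged across workers here
    return points

assert _local_sgd_sync_points(40)[:3] == [30, 34, 38]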
@property @property
def adaptive_localsgd(self): def adaptive_localsgd(self):
""" """
Indicating whether we are using Adaptive Local SGD training. Default Value: False Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_. the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1593,6 +1763,7 @@ class DistributedStrategy(object): ...@@ -1593,6 +1763,7 @@ class DistributedStrategy(object):
@property @property
def adaptive_localsgd_configs(self): def adaptive_localsgd_configs(self):
""" """
Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
setting that can be configured through a dict. setting that can be configured through a dict.
...@@ -1600,10 +1771,10 @@ class DistributedStrategy(object): ...@@ -1600,10 +1771,10 @@ class DistributedStrategy(object):
init_k_steps(int): The initial number of local steps before Adaptive LocalSGD
takes effect; the adaptive algorithm then adjusts this value automatically.
Default 1.
begin_step(int): The step at which to begin Adaptive LocalSGD training. Default 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1611,6 +1782,7 @@ class DistributedStrategy(object): ...@@ -1611,6 +1782,7 @@ class DistributedStrategy(object):
strategy.adaptive_localsgd = True strategy.adaptive_localsgd = True
strategy.adaptive_localsgd_configs = {"init_k_steps": 1, strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
"begin_step": 30} "begin_step": 30}
""" """
return get_msg_dict(self.strategy.adaptive_localsgd_configs) return get_msg_dict(self.strategy.adaptive_localsgd_configs)
...@@ -1618,20 +1790,23 @@ class DistributedStrategy(object): ...@@ -1618,20 +1790,23 @@ class DistributedStrategy(object):
@adaptive_localsgd_configs.setter @adaptive_localsgd_configs.setter
@is_strict_auto @is_strict_auto
def adaptive_localsgd_configs(self, configs): def adaptive_localsgd_configs(self, configs):
check_configs_key(self.strategy.adaptive_localsgd_configs, configs, check_configs_key(
"adaptive_localsgd_configs") self.strategy.adaptive_localsgd_configs,
configs,
"adaptive_localsgd_configs",
)
assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) assign_configs_value(self.strategy.adaptive_localsgd_configs, configs)
@property @property
def dgc(self): def dgc(self):
""" """
Indicating whether we are using Deep Gradient Compression training. For more details, please refer to Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
[Deep Gradient Compression](https://arxiv.org/abs/1712.01887). [Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1652,6 +1827,7 @@ class DistributedStrategy(object): ...@@ -1652,6 +1827,7 @@ class DistributedStrategy(object):
@property @property
def dgc_configs(self): def dgc_configs(self):
r""" r"""
Set Deep Gradient Compression training configurations. In general, DGC has several configurable
settings that can be configured through a dict.
...@@ -1668,13 +1844,13 @@ class DistributedStrategy(object): ...@@ -1668,13 +1844,13 @@ class DistributedStrategy(object):
element will be transmitted. element will be transmitted.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.dgc = True strategy.dgc = True
strategy.dgc_configs = {"rampup_begin_step": 1252} strategy.dgc_configs = {"rampup_begin_step": 1252}
""" """
return get_msg_dict(self.strategy.dgc_configs) return get_msg_dict(self.strategy.dgc_configs)
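# A pure-Python sketch of the top-k gradient sparsification idea behind DGC's
# sparsity setting: only the largest-magnitude fraction of gradient elements is
# transmitted each step. Illustrative only, not the actual DGC operator.
def _topk_sparsify(grad, keep_ratio):
    k = max(1, int(len(grad) * keep_ratio))
    threshold = sorted((abs(g) for g in grad), reverse=True)[k - 1]
    return [g if abs(g) >= threshold else 0.0 for g in grad]

print(_topk_sparsify([0.1, -2.0, 0.03, 0.5], keep_ratio=0.5))  # [0.0, -2.0, 0.0, 0.5]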
...@@ -1687,14 +1863,15 @@ class DistributedStrategy(object): ...@@ -1687,14 +1863,15 @@ class DistributedStrategy(object):
@property @property
def fp16_allreduce(self): def fp16_allreduce(self):
""" """
Indicating whether we are using fp16 gradient allreduce training.
Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True # by default this is false strategy.fp16_allreduce = True # by default this is false
...@@ -1711,6 +1888,7 @@ class DistributedStrategy(object): ...@@ -1711,6 +1888,7 @@ class DistributedStrategy(object):
@property @property
def gradient_merge(self): def gradient_merge(self):
""" """
Gradient Merge, also known as Gradient Accumulation,
is a strategy for large batch training. With this strategy,
model parameters are not updated until the gradients of a user-defined number of steps have been accumulated.
...@@ -1721,13 +1899,13 @@ class DistributedStrategy(object): ...@@ -1721,13 +1899,13 @@ class DistributedStrategy(object):
to model parameters. to model parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
""" """
return self.strategy.gradient_merge return self.strategy.gradient_merge
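# A minimal sketch of the update rule gradient merge implies, assuming
# k_steps=4 and avg=True as in the example above; `lr` and the scalar
# "parameter" are illustrative placeholders.
def _accumulate_and_step(param, grads, k_steps=4, avg=True, lr=0.1):
    acc = 0.0
    for i, g in enumerate(grads, start=1):
        acc += g
        if i % k_steps == 0:      # parameters change only every k_steps batches
            acc = acc / k_steps if avg else acc
            param -= lr * acc
            acc = 0.0
    return param

print(_accumulate_and_step(1.0, [0.5] * 8))  # two updates of 0.05 each, ~0.9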
...@@ -1742,6 +1920,7 @@ class DistributedStrategy(object): ...@@ -1742,6 +1920,7 @@ class DistributedStrategy(object):
@property @property
def gradient_merge_configs(self): def gradient_merge_configs(self):
""" """
The key-value configurations for gradient merge.
**Note**: **Note**:
...@@ -1750,26 +1929,28 @@ class DistributedStrategy(object): ...@@ -1750,26 +1929,28 @@ class DistributedStrategy(object):
avg(bool): whether to average the accumulated gradients of the mini-batches. The default value is `True`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
""" """
return get_msg_dict(self.strategy.gradient_merge_configs) return get_msg_dict(self.strategy.gradient_merge_configs)
@gradient_merge_configs.setter @gradient_merge_configs.setter
@is_strict_auto @is_strict_auto
def gradient_merge_configs(self, configs): def gradient_merge_configs(self, configs):
check_configs_key(self.strategy.gradient_merge_configs, configs, check_configs_key(
"gradient_configs") self.strategy.gradient_merge_configs, configs, "gradient_configs"
)
assign_configs_value(self.strategy.gradient_merge_configs, configs) assign_configs_value(self.strategy.gradient_merge_configs, configs)
@property @property
def lars(self): def lars(self):
""" """
Set lars configurations. lars is used to deal with the convergence problems when the global Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to batch size is larger than 8k. For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888). [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
...@@ -1777,12 +1958,12 @@ class DistributedStrategy(object): ...@@ -1777,12 +1958,12 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.lars = True # by default this is false strategy.lars = True # by default this is false
""" """
return self.strategy.lars return self.strategy.lars
...@@ -1797,6 +1978,7 @@ class DistributedStrategy(object): ...@@ -1797,6 +1978,7 @@ class DistributedStrategy(object):
@property @property
def lars_configs(self): def lars_configs(self):
""" """
Set Lars training configurations. Set Lars training configurations.
**Notes**: **Notes**:
...@@ -1808,7 +1990,6 @@ class DistributedStrategy(object): ...@@ -1808,7 +1990,6 @@ class DistributedStrategy(object):
will be excluded from weight decay in the LARS formula.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1820,6 +2001,7 @@ class DistributedStrategy(object): ...@@ -1820,6 +2001,7 @@ class DistributedStrategy(object):
"epsilon": 0, "epsilon": 0,
"exclude_from_weight_decay": ['batch_norm', '.b_0'] "exclude_from_weight_decay": ['batch_norm', '.b_0']
} }
""" """
return get_msg_dict(self.strategy.lars_configs) return get_msg_dict(self.strategy.lars_configs)
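# A sketch of the layer-wise learning-rate scaling from the LARS paper linked
# above, showing where lars_coeff, lars_weight_decay and epsilon enter; plain
# floats stand in for the parameter and gradient norms of one layer.
def _lars_local_lr(base_lr, w_norm, g_norm, lars_coeff=0.001,
                   lars_weight_decay=0.0005, epsilon=0.0):
    trust = lars_coeff * w_norm / (g_norm + lars_weight_decay * w_norm + epsilon)
    return base_lr * trust

print(_lars_local_lr(1.0, w_norm=10.0, g_norm=0.5))  # ~0.0198, a per-layer lr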
...@@ -1832,6 +2014,7 @@ class DistributedStrategy(object): ...@@ -1832,6 +2014,7 @@ class DistributedStrategy(object):
@property @property
def lamb(self): def lamb(self):
""" """
Set lamb configurations. lamb is used to deal with the convergence problems for large Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-based models like BERT. For more details,
please refer to please refer to
...@@ -1840,12 +2023,12 @@ class DistributedStrategy(object): ...@@ -1840,12 +2023,12 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.lamb = True # by default this is false strategy.lamb = True # by default this is false
""" """
return self.strategy.lamb return self.strategy.lamb
...@@ -1861,6 +2044,7 @@ class DistributedStrategy(object): ...@@ -1861,6 +2044,7 @@ class DistributedStrategy(object):
@property @property
def lamb_configs(self): def lamb_configs(self):
""" """
Set LAMB training configurations.
**Notes**: **Notes**:
...@@ -1869,7 +2053,6 @@ class DistributedStrategy(object): ...@@ -1869,7 +2053,6 @@ class DistributedStrategy(object):
will be excluded from weight decay in the LAMB formula.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1879,6 +2062,7 @@ class DistributedStrategy(object): ...@@ -1879,6 +2062,7 @@ class DistributedStrategy(object):
'lamb_weight_decay': 0.01, 'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': [], 'exclude_from_weight_decay': [],
} }
""" """
return get_msg_dict(self.strategy.lamb_configs) return get_msg_dict(self.strategy.lamb_configs)
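# A sketch of the layer-wise trust ratio at the heart of LAMB (see the paper
# referenced above): the Adam-style update, plus lamb_weight_decay * w for
# parameters not in exclude_from_weight_decay, is rescaled by ||w|| / ||update||.
def _lamb_trust_ratio(w_norm, update_norm):
    if w_norm == 0.0 or update_norm == 0.0:
        return 1.0
    return w_norm / update_norm

# a layer with large weights but a small update gets its step scaled up
print(_lamb_trust_ratio(w_norm=4.0, update_norm=0.25))  # 16.0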
...@@ -1891,8 +2075,10 @@ class DistributedStrategy(object): ...@@ -1891,8 +2075,10 @@ class DistributedStrategy(object):
@property @property
def elastic(self): def elastic(self):
""" """
Indicating whether we want to do the current distributed training on clusters with elastic resources.
Currently, this configuration is not valid.
""" """
return self.strategy.elastic return self.strategy.elastic
...@@ -1907,6 +2093,7 @@ class DistributedStrategy(object): ...@@ -1907,6 +2093,7 @@ class DistributedStrategy(object):
@property @property
def auto(self): def auto(self):
""" """
Indicating whether we are using the auto-parallel configuration.
This feature is currently experimental. Currently,
auto-parallelism can be used only when a user does not set any other auto-parallelism can be used only when a user does not set any other
...@@ -1915,7 +2102,6 @@ class DistributedStrategy(object): ...@@ -1915,7 +2102,6 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -1929,6 +2115,7 @@ class DistributedStrategy(object): ...@@ -1929,6 +2115,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.auto return self.strategy.auto
...@@ -1942,6 +2129,7 @@ class DistributedStrategy(object): ...@@ -1942,6 +2129,7 @@ class DistributedStrategy(object):
@property @property
def semi_auto(self): def semi_auto(self):
""" """
Indicating whether we are using the semi-auto parallel function.
This feature is currently experimental. Currently,
auto-parallelism can be used only when a user does not set any other auto-parallelism can be used only when a user does not set any other
...@@ -1950,7 +2138,6 @@ class DistributedStrategy(object): ...@@ -1950,7 +2138,6 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -1964,6 +2151,7 @@ class DistributedStrategy(object): ...@@ -1964,6 +2151,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.semi_auto return self.strategy.semi_auto
...@@ -1977,16 +2165,21 @@ class DistributedStrategy(object): ...@@ -1977,16 +2165,21 @@ class DistributedStrategy(object):
@property @property
def auto_search(self): def auto_search(self):
""" """
Indicating whether we are using the auto-search parallel function.
For details, please refer to the following code example.
Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.auto_search = True strategy.auto_search = True
""" """
return self.strategy.auto_search return self.strategy.auto_search
...@@ -2000,15 +2193,20 @@ class DistributedStrategy(object): ...@@ -2000,15 +2193,20 @@ class DistributedStrategy(object):
@property @property
def split_data(self): def split_data(self):
""" """
Indicating whether we split the data. If True, the data will be split.
Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.split_data = True strategy.split_data = True
""" """
return self.strategy.split_data return self.strategy.split_data
...@@ -2022,8 +2220,10 @@ class DistributedStrategy(object): ...@@ -2022,8 +2220,10 @@ class DistributedStrategy(object):
@property @property
def qat(self): def qat(self):
""" """
Indicating whether we are using quantization-aware training (QAT).
Default Value: False
""" """
return self.strategy.qat return self.strategy.qat
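# A minimal usage sketch: enabling the switch follows the same pattern as the
# other strategy flags; the quantization details are set via qat_configs below.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.qat = True  # by default this is false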
...@@ -2037,6 +2237,7 @@ class DistributedStrategy(object): ...@@ -2037,6 +2237,7 @@ class DistributedStrategy(object):
@property @property
def qat_configs(self): def qat_configs(self):
""" """
Set quantization training configurations. In general, QAT has several configurable
settings that can be configured through a dict.
...@@ -2053,10 +2254,10 @@ class DistributedStrategy(object): ...@@ -2053,10 +2254,10 @@ class DistributedStrategy(object):
algo(str): Other quantization training algorithm. algo(str): Other quantization training algorithm.
Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.qat = True strategy.qat = True
strategy.qat_configs = { strategy.qat_configs = {
...@@ -2076,13 +2277,13 @@ class DistributedStrategy(object): ...@@ -2076,13 +2277,13 @@ class DistributedStrategy(object):
@property @property
def heter_ccl_mode(self): def heter_ccl_mode(self):
""" """
Indicating whether we are using heter_ccl_mode for model training. Indicating whether we are using heter_ccl_mode for model training.
This feature is currently an experimental feature. Currently, This feature is currently an experimental feature. Currently,
heter_ccl_mode can be used only for dataparallel with dygraph mode. heter_ccl_mode can be used only for dataparallel with dygraph mode.
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -2094,6 +2295,7 @@ class DistributedStrategy(object): ...@@ -2094,6 +2295,7 @@ class DistributedStrategy(object):
# for initialize parallel env, only need to call # for initialize parallel env, only need to call
paddle.distributed.init_parallel_env() paddle.distributed.init_parallel_env()
# then the heterogeneous context will be created.
""" """
return self.strategy.heter_ccl_mode return self.strategy.heter_ccl_mode
...@@ -2107,6 +2309,7 @@ class DistributedStrategy(object): ...@@ -2107,6 +2309,7 @@ class DistributedStrategy(object):
@property @property
def cudnn_exhaustive_search(self): def cudnn_exhaustive_search(self):
""" """
Indicating whether to use the exhaustive search method to choose convolution algorithms.
Exhaustive search attempts all cuDNN algorithms to choose the fastest one.
This method is time-consuming; the chosen algorithm will be cached for the given layer specifications.
...@@ -2114,17 +2317,18 @@ class DistributedStrategy(object): ...@@ -2114,17 +2317,18 @@ class DistributedStrategy(object):
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False strategy.cudnn_exhaustive_search = False
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.cudnn_exhaustive_search return self.strategy.cudnn_exhaustive_search
...@@ -2141,6 +2345,7 @@ class DistributedStrategy(object): ...@@ -2141,6 +2345,7 @@ class DistributedStrategy(object):
@property @property
def conv_workspace_size_limit(self): def conv_workspace_size_limit(self):
""" """
The workspace limit size in MB for choosing cuDNN convolution algorithms.
The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit.
Usually, a larger workspace size makes it possible to choose faster algorithms,
...@@ -2148,12 +2353,12 @@ class DistributedStrategy(object): ...@@ -2148,12 +2353,12 @@ class DistributedStrategy(object):
Default Value: 4000 Default Value: 4000
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1024 strategy.conv_workspace_size_limit = 1024
...@@ -2176,17 +2381,18 @@ class DistributedStrategy(object): ...@@ -2176,17 +2381,18 @@ class DistributedStrategy(object):
@property @property
def cudnn_batchnorm_spatial_persistent(self): def cudnn_batchnorm_spatial_persistent(self):
""" """
Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for batch normalization.
This is only useful with cuDNN.
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = True strategy.cudnn_batchnorm_spatial_persistent = True
...@@ -2244,7 +2450,8 @@ class DistributedStrategy(object): ...@@ -2244,7 +2450,8 @@ class DistributedStrategy(object):
h1_format = " " + "|{{:^{}s}}|\n".format(length) h1_format = " " + "|{{:^{}s}}|\n".format(length)
h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format( h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
max_k, " " * spacing, max_v) max_k, " " * spacing, max_v
)
border = " +" + "".join(["="] * length) + "+" border = " +" + "".join(["="] * length) + "+"
line = " +" + "".join(["-"] * length) + "+" line = " +" + "".join(["-"] * length) + "+"
...@@ -2269,37 +2476,48 @@ class DistributedStrategy(object): ...@@ -2269,37 +2476,48 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name): if getattr(self.strategy, f.name):
draws += border + "\n" draws += border + "\n"
draws += h1_format.format( draws += h1_format.format(
"{}=True <-> {}_configs".format(f.name, f.name)) "{}=True <-> {}_configs".format(f.name, f.name)
)
draws += line + "\n" draws += line + "\n"
my_configs = getattr(self.strategy, my_configs = getattr(
f.name + "_configs") self.strategy, f.name + "_configs"
)
config_fields = my_configs.DESCRIPTOR.fields config_fields = my_configs.DESCRIPTOR.fields
for ff in config_fields: for ff in config_fields:
if isinstance( if isinstance(
getattr(my_configs, getattr(my_configs, ff.name),
ff.name), google.protobuf.pyext. google.protobuf.pyext._message.RepeatedScalarContainer,
_message.RepeatedScalarContainer): ):
values = getattr(my_configs, ff.name) values = getattr(my_configs, ff.name)
for i, v in enumerate(values): for i, v in enumerate(values):
if i == 0: if i == 0:
draws += h2_format.format( draws += h2_format.format(
ff.name, str(v)) ff.name, str(v)
)
else: else:
draws += h2_format.format( draws += h2_format.format(
"", str(v)) "", str(v)
)
else: else:
draws += h2_format.format( draws += h2_format.format(
ff.name, ff.name,
str(getattr(my_configs, ff.name))) str(getattr(my_configs, ff.name)),
)
else: else:
env_draws += h2_format.format( env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name))) f.name, str(getattr(self.strategy, f.name))
)
else: else:
env_draws += h2_format.format( env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name))) f.name, str(getattr(self.strategy, f.name))
)
result_res = draws + border + "\n" + h1_format.format( result_res = (
"Environment Flags, Communication Flags") draws
+ border
+ "\n"
+ h1_format.format("Environment Flags, Communication Flags")
)
result_res += env_draws result_res += env_draws
build_strategy_str = border + "\n" build_strategy_str = border + "\n"
...@@ -2309,7 +2527,8 @@ class DistributedStrategy(object): ...@@ -2309,7 +2527,8 @@ class DistributedStrategy(object):
fields = self.strategy.build_strategy.DESCRIPTOR.fields fields = self.strategy.build_strategy.DESCRIPTOR.fields
for f in fields: for f in fields:
build_strategy_str += h2_format.format( build_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.build_strategy, f.name))) f.name, str(getattr(self.strategy.build_strategy, f.name))
)
build_strategy_str += border + "\n" build_strategy_str += border + "\n"
execution_strategy_str = h1_format.format("Execution Strategy") execution_strategy_str = h1_format.format("Execution Strategy")
...@@ -2318,7 +2537,8 @@ class DistributedStrategy(object): ...@@ -2318,7 +2537,8 @@ class DistributedStrategy(object):
fields = self.strategy.execution_strategy.DESCRIPTOR.fields fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields: for f in fields:
execution_strategy_str += h2_format.format( execution_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.execution_strategy, f.name))) f.name, str(getattr(self.strategy.execution_strategy, f.name))
)
execution_strategy_str += border + "\n" execution_strategy_str += border + "\n"
result_res += build_strategy_str + execution_strategy_str result_res += build_strategy_str + execution_strategy_str
......
...@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None ...@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None
class ParallelMode(object): class ParallelMode(object):
""" """
These are all the parallel modes currently supported:
- DATA_PARALLEL: Distribute the input data to different devices.
- TENSOR_PARALLEL: Shard tensors in the network across different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -43,6 +44,7 @@ class ParallelMode(object): ...@@ -43,6 +44,7 @@ class ParallelMode(object):
print(parallel_mode.DATA_PARALLEL) # 0 print(parallel_mode.DATA_PARALLEL) # 0
""" """
DATA_PARALLEL = 0 DATA_PARALLEL = 0
TENSOR_PARALLEL = 1 TENSOR_PARALLEL = 1
PIPELINE_PARALLEL = 2 PIPELINE_PARALLEL = 2
...@@ -50,14 +52,16 @@ class ParallelMode(object): ...@@ -50,14 +52,16 @@ class ParallelMode(object):
class CommunicateTopology(object): class CommunicateTopology(object):
def __init__(
def __init__(self, self,
hybrid_group_names=["data", "pipe", "sharding", "model"], hybrid_group_names=["data", "pipe", "sharding", "model"],
dims=[1, 1, 1, 1]): dims=[1, 1, 1, 1],
):
self._parallel_names = hybrid_group_names self._parallel_names = hybrid_group_names
self._dims = dims self._dims = dims
self.coordinate = collections.namedtuple('Coordinate', self.coordinate = collections.namedtuple(
self._parallel_names) 'Coordinate', self._parallel_names
)
self._world_size = reduce(lambda x, y: x * y, self._dims) self._world_size = reduce(lambda x, y: x * y, self._dims)
ranges = [range(d) for d in self._dims] ranges = [range(d) for d in self._dims]
...@@ -65,7 +69,8 @@ class CommunicateTopology(object): ...@@ -65,7 +69,8 @@ class CommunicateTopology(object):
self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate)))) self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
self._rank2coord = dict( self._rank2coord = dict(
zip(self._coord2rank.values(), self._coord2rank.keys())) zip(self._coord2rank.values(), self._coord2rank.keys())
)
def get_hybrid_group_names(self): def get_hybrid_group_names(self):
return self._parallel_names return self._parallel_names
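# A standalone sketch of the coordinate <-> rank bookkeeping built in
# __init__ above: every point of the cartesian product of the per-axis ranges
# gets a consecutive rank. This mirrors the logic for illustration only.
import collections
import itertools
from functools import reduce

names, dims = ["data", "pipe", "sharding", "model"], [2, 1, 1, 2]
Coordinate = collections.namedtuple("Coordinate", names)
coords = [Coordinate(*c) for c in itertools.product(*[range(d) for d in dims])]
coord2rank = {c: r for r, c in enumerate(coords)}

assert reduce(lambda x, y: x * y, dims) == len(coords) == 4
print(coord2rank[Coordinate(data=1, pipe=0, sharding=0, model=1)])  # rank 3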
...@@ -90,7 +95,8 @@ class CommunicateTopology(object): ...@@ -90,7 +95,8 @@ class CommunicateTopology(object):
def get_axis_list(self, axis_name, index): def get_axis_list(self, axis_name, index):
axis = self._parallel_names.index(axis_name) axis = self._parallel_names.index(axis_name)
ranks = [ ranks = [
self._coord2rank[coord] for coord in self._coord2rank.keys() self._coord2rank[coord]
for coord in self._coord2rank.keys()
if coord[axis] == index if coord[axis] == index
] ]
ranks.sort() ranks.sort()
...@@ -132,7 +138,6 @@ class CommunicateTopology(object): ...@@ -132,7 +138,6 @@ class CommunicateTopology(object):
class HybridCommunicateGroup(object): class HybridCommunicateGroup(object):
def __init__(self, topology): def __init__(self, topology):
self.nranks = paddle.distributed.get_world_size() self.nranks = paddle.distributed.get_world_size()
self.global_rank = paddle.distributed.get_rank() self.global_rank = paddle.distributed.get_rank()
...@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object): ...@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object):
self._sharding_parallel_id = self._get_sharding_parallel_id() self._sharding_parallel_id = self._get_sharding_parallel_id()
self.stage_id = self._get_pipe_parallel_id() self.stage_id = self._get_pipe_parallel_id()
assert self._check_vaild_topo( assert self._check_vaild_topo(), (
), "Here is an unreasonable topogy setting. world_size: {}, but" \ "Here is an unreasonable topogy setting. world_size: {}, but"
"mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks, "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(
self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree) self.nranks,
self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
# create comm group for data parallel # create comm group for data parallel
self._dp_group, self._dp_comm_group = self._set_comm_group("data") self._dp_group, self._dp_comm_group = self._set_comm_group("data")
...@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object): ...@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object):
# create comm group for sharding parallel # create comm group for sharding parallel
self._sharding_group, self._sharding_comm_group = self._set_comm_group( self._sharding_group, self._sharding_comm_group = self._set_comm_group(
"sharding") "sharding"
)
# create global group for check inf_nan / clip global norm # create global group for check inf_nan / clip global norm
self._check_group, self._check_comm_group = self._set_check_group( self._check_group, self._check_comm_group = self._set_check_group(
"data") "data"
)
# create p2p group # create p2p group
self.is_first_stage = (self.stage_id == 0) self.is_first_stage = self.stage_id == 0
self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) self.is_last_stage = self.stage_id == (self._pp_degree - 1)
# create p2p_groups # create p2p_groups
if self._pp_degree > 1: if self._pp_degree > 1:
self._set_p2p_group() self._set_p2p_group()
debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \ debug_str = (
"sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree, "HybridParallelInfo: rank_id: %d, mp_degree: %d, "
self._sharding_degree, self._pp_degree, self._dp_degree) "sharding_degree: %d, pp_degree: %d, dp_degree: %d"
debug_str += ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % ( % (
self._mp_group, self._sharding_group, self._pp_group, self.global_rank,
self._dp_group, self._check_group) self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
debug_str += (
", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s"
% (
self._mp_group,
self._sharding_group,
self._pp_group,
self._dp_group,
self._check_group,
)
)
logger.info(debug_str) logger.info(debug_str)
global _HYBRID_PARALLEL_GROUP global _HYBRID_PARALLEL_GROUP
...@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object): ...@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object):
# adding its parallel logic within that parallelism # adding its parallel logic within that parallelism
# when use sharding alone, it should have its own parallelism for its parallel logic # when use sharding alone, it should have its own parallelism for its parallel logic
# TODO modify 3 others parallel to support sharding # TODO modify 3 others parallel to support sharding
if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1: if (
self._mp_degree == 1
and self._pp_degree == 1
and self._dp_degree == 1
and self._sharding_degree > 1
):
return ParallelMode.SHARDING_PARALLEL return ParallelMode.SHARDING_PARALLEL
elif self._mp_degree == 1 and self._pp_degree == 1: elif self._mp_degree == 1 and self._pp_degree == 1:
return ParallelMode.DATA_PARALLEL return ParallelMode.DATA_PARALLEL
...@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object): ...@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object):
return ParallelMode.PIPELINE_PARALLEL return ParallelMode.PIPELINE_PARALLEL
def _check_vaild_topo(self): def _check_vaild_topo(self):
return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks return (
self._dp_degree
* self._mp_degree
* self._pp_degree
* self._sharding_degree
== self.nranks
)
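# The check above in isolation: the four parallel degrees must tile the world
# size exactly. A standalone restatement with example numbers for clarity.
def _degrees_match_world_size(dp, mp, pp, sharding, nranks):
    return dp * mp * pp * sharding == nranks

assert _degrees_match_world_size(dp=4, mp=2, pp=1, sharding=1, nranks=8)
assert not _degrees_match_world_size(dp=4, mp=2, pp=2, sharding=1, nranks=8)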
def _set_comm_group(self, parallel_method="data"): def _set_comm_group(self, parallel_method="data"):
parallel_group = [] parallel_group = []
...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object): ...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object):
self.prev_rank = prev_rank self.prev_rank = prev_rank
next_group = paddle.distributed.new_group( next_group = paddle.distributed.new_group(
ranks=[curr_rank, next_rank]) ranks=[curr_rank, next_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_next_group = next_group self.send_next_group = next_group
elif self.global_rank == next_rank: elif self.global_rank == next_rank:
self.recv_prev_group = next_group self.recv_prev_group = next_group
prev_group = paddle.distributed.new_group( prev_group = paddle.distributed.new_group(
ranks=[prev_rank, curr_rank]) ranks=[prev_rank, curr_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_prev_group = prev_group self.send_prev_group = prev_group
...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object): ...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object):
return self._pp_comm_group return self._pp_comm_group
def get_p2p_groups(self): def get_p2p_groups(self):
return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group return (
self.send_next_group,
self.send_prev_group,
self.recv_next_group,
self.recv_prev_group,
)
# sharding parallel message: # sharding parallel message:
def _get_sharding_parallel_id(self): def _get_sharding_parallel_id(self):
...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object): ...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object):
return self._check_comm_group return self._check_comm_group
def get_rank_from_stage(self, stage_id, **kwargs): def get_rank_from_stage(self, stage_id, **kwargs):
return self._topo.get_rank_from_stage(self.global_rank, return self._topo.get_rank_from_stage(
pipe=stage_id, self.global_rank, pipe=stage_id, **kwargs
**kwargs) )
class _CommunicateGroup(object): class _CommunicateGroup(object):
""" tmp for static """ """tmp for static"""
def __init__(self): def __init__(self):
global _HYBRID_PARALLEL_GROUP global _HYBRID_PARALLEL_GROUP
_HYBRID_PARALLEL_GROUP = self _HYBRID_PARALLEL_GROUP = self
self.groups = dict() self.groups = dict()
def set_comm_group(self, group_name, group_rank, group_size, ring_id, def set_comm_group(
group_ranks): self, group_name, group_rank, group_size, ring_id, group_ranks
group = paddle.distributed.collective.Group(group_rank, ring_id, ):
group_ranks) group = paddle.distributed.collective.Group(
group_rank, ring_id, group_ranks
)
self.groups[group_name] = group self.groups[group_name] = group
def get_group(self, group_name): def get_group(self, group_name):
......
...@@ -103,6 +103,7 @@ def _check_var_exists(var_name): ...@@ -103,6 +103,7 @@ def _check_var_exists(var_name):
def init_parallel_env(): def init_parallel_env():
""" """
Initialize parallel training environment in dynamic graph mode. Initialize parallel training environment in dynamic graph mode.
Note: Note:
...@@ -118,6 +119,7 @@ def init_parallel_env(): ...@@ -118,6 +119,7 @@ def init_parallel_env():
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
...@@ -158,6 +160,7 @@ def init_parallel_env(): ...@@ -158,6 +160,7 @@ def init_parallel_env():
if __name__ == '__main__': if __name__ == '__main__':
dist.spawn(train) dist.spawn(train)
""" """
# 0. get env & check world size # 0. get env & check world size
......
...@@ -51,61 +51,76 @@ __all__ = [ ...@@ -51,61 +51,76 @@ __all__ = [
def _check_normalization(norm): def _check_normalization(norm):
if norm not in ['forward', 'backward', 'ortho']: if norm not in ['forward', 'backward', 'ortho']:
raise ValueError( raise ValueError(
"Unexpected norm: {}. Norm should be forward, backward or ortho". "Unexpected norm: {}. Norm should be forward, backward or ortho".format(
format(norm)) norm
)
)
def _check_fft_n(n): def _check_fft_n(n):
if not isinstance(n, int): if not isinstance(n, int):
raise ValueError( raise ValueError(
"Invalid FFT argument n({}), it shoule be an integer.".format(n)) "Invalid FFT argument n({}), it shoule be an integer.".format(n)
)
if n <= 0: if n <= 0:
raise ValueError( raise ValueError(
"Invalid FFT argument n({}), it should be positive.".format(n)) "Invalid FFT argument n({}), it should be positive.".format(n)
)
def _check_fft_shape(x, s): def _check_fft_shape(x, s):
ndim = x.ndim ndim = x.ndim
if not isinstance(s, Sequence): if not isinstance(s, Sequence):
raise ValueError( raise ValueError(
"Invaid FFT argument s({}), it should be a sequence of integers.") "Invaid FFT argument s({}), it should be a sequence of integers."
)
if len(s) > ndim: if len(s) > ndim:
raise ValueError( raise ValueError(
"Length of FFT argument s should not be larger than the rank of input. " "Length of FFT argument s should not be larger than the rank of input. "
"Received s: {}, rank of x: {}".format(s, ndim)) "Received s: {}, rank of x: {}".format(s, ndim)
)
for size in s: for size in s:
if not isinstance(size, int) or size <= 0: if not isinstance(size, int) or size <= 0:
raise ValueError("FFT sizes {} contains invalid value ({})".format( raise ValueError(
s, size)) "FFT sizes {} contains invalid value ({})".format(s, size)
)
def _check_fft_axis(x, axis): def _check_fft_axis(x, axis):
ndim = x.ndim ndim = x.ndim
if not isinstance(axis, int): if not isinstance(axis, int):
raise ValueError( raise ValueError(
"Invalid FFT axis ({}), it shoule be an integer.".format(axis)) "Invalid FFT axis ({}), it shoule be an integer.".format(axis)
)
if axis < -ndim or axis >= ndim: if axis < -ndim or axis >= ndim:
raise ValueError( raise ValueError(
"Invalid FFT axis ({}), it should be in range [-{}, {})".format( "Invalid FFT axis ({}), it should be in range [-{}, {})".format(
axis, ndim, ndim)) axis, ndim, ndim
)
)
def _check_fft_axes(x, axes): def _check_fft_axes(x, axes):
ndim = x.ndim ndim = x.ndim
if not isinstance(axes, Sequence): if not isinstance(axes, Sequence):
raise ValueError( raise ValueError(
"Invalid FFT axes ({}), it should be a sequence of integers.". "Invalid FFT axes ({}), it should be a sequence of integers.".format(
format(axes)) axes
)
)
if len(axes) > ndim: if len(axes) > ndim:
raise ValueError( raise ValueError(
"Length of fft axes should not be larger than the rank of input. " "Length of fft axes should not be larger than the rank of input. "
"Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)
)
for axis in axes: for axis in axes:
if not isinstance(axis, int) or axis < -ndim or axis >= ndim: if not isinstance(axis, int) or axis < -ndim or axis >= ndim:
raise ValueError( raise ValueError(
"FFT axes {} contains invalid value ({}), it should be in range [-{}, {})" "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".format(
.format(axes, axis, ndim, ndim)) axes, axis, ndim, ndim
)
)
def _resize_fft_input(x, s, axes): def _resize_fft_input(x, s, axes):
...@@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes): ...@@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes):
slices.append((0, s[i])) slices.append((0, s[i]))
if axes_to_slice: if axes_to_slice:
x = paddle.slice(x, x = paddle.slice(
x,
axes_to_slice, axes_to_slice,
starts=[item[0] for item in slices], starts=[item[0] for item in slices],
ends=[item[1] for item in slices]) ends=[item[1] for item in slices],
)
if axes_to_pad: if axes_to_pad:
padding_widths = [0] * (2 * ndim) padding_widths = [0] * (2 * ndim)
for axis, pad in zip(axes_to_pad, paddings): for axis, pad in zip(axes_to_pad, paddings):
...@@ -146,8 +163,9 @@ def _normalize_axes(x, axes): ...@@ -146,8 +163,9 @@ def _normalize_axes(x, axes):
def _check_at_least_ndim(x, rank): def _check_at_least_ndim(x, rank):
if x.ndim < rank: if x.ndim < rank:
raise ValueError("The rank of the input ({}) should >= {}".format( raise ValueError(
x.ndim, rank)) "The rank of the input ({}) should >= {}".format(x.ndim, rank)
)
# public APIs 1d # public APIs 1d
...@@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): ...@@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None):
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fft_r2c(x, return fft_r2c(
n, x, n, axis, norm, forward=True, onesided=False, name=name
axis, )
norm,
forward=True,
onesided=False,
name=name)
else: else:
return fft_c2c(x, n, axis, norm, forward=True, name=name) return fft_c2c(x, n, axis, norm, forward=True, name=name)
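# The dispatch above sends real-valued inputs through the r2c kernel and
# complex inputs through c2c; both return a complex spectrum. A small check,
# assuming a working paddle installation:
import numpy as np
import paddle

x_real = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0], dtype="float32"))
x_cplx = paddle.to_tensor(np.array([1, 2, 3, 4], dtype="complex64"))

print(paddle.fft.fft(x_real).dtype)  # paddle.complex64, via fft_r2c (onesided=False)
print(paddle.fft.fft(x_cplx).dtype)  # paddle.complex64, via fft_c2c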
...@@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): ...@@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None):
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fft_r2c(x, return fft_r2c(
n, x, n, axis, norm, forward=False, onesided=False, name=name
axis, )
norm,
forward=False,
onesided=False,
name=name)
else: else:
return fft_c2c(x, n, axis, norm, forward=False, name=name) return fft_c2c(x, n, axis, norm, forward=False, name=name)
...@@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
# [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]]
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fftn_r2c(x, return fftn_r2c(
s, x, s, axes, norm, forward=True, onesided=False, name=name
axes, )
norm,
forward=True,
onesided=False,
name=name)
else: else:
return fftn_c2c(x, s, axes, norm, forward=True, name=name) return fftn_c2c(x, s, axes, norm, forward=True, name=name)
...@@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None):
# (-0.1666666716337204+0.28867512941360474j)]]) # (-0.1666666716337204+0.28867512941360474j)]])
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fftn_r2c(x, return fftn_r2c(
s, x, s, axes, norm, forward=False, onesided=False, name=name
axes, )
norm,
forward=False,
onesided=False,
name=name)
else: else:
return fftn_c2c(x, s, axes, norm, forward=False, name=name) return fftn_c2c(x, s, axes, norm, forward=False, name=name)
def rfftn(x, s=None, axes=None, norm="backward", name=None): def rfftn(x, s=None, axes=None, norm="backward", name=None):
""" """
The N dimensional FFT for real input. The N dimensional FFT for real input.
This function computes the N-dimensional discrete Fourier Transform over This function computes the N-dimensional discrete Fourier Transform over
...@@ -665,10 +668,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -665,10 +668,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None):
refer to :ref:`api_guide_Name` . refer to :ref:`api_guide_Name` .
Returns: Returns:
out(Tensor): complex tensor out(Tensor), complex tensor
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return fftn(x, s, axes, norm, name) return fftn(x, s, axes, norm, name)
...@@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return ifftn(x, s, axes, norm, name) return ifftn(x, s, axes, norm, name)
...@@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return rfftn(x, s, axes, norm, name) return rfftn(x, s, axes, norm, name)
...@@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return irfftn(x, s, axes, norm, name) return irfftn(x, s, axes, norm, name)
...@@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return hfftn(x, s, axes, norm, name) return hfftn(x, s, axes, norm, name)
...@@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return ihfftn(x, s, axes, norm, name) return ihfftn(x, s, axes, norm, name)
...@@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name): ...@@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): ...@@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
if in_dygraph_mode(): if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'onesided', onesided) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else: else:
inputs = { inputs = {
...@@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): ...@@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype)) _real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name): ...@@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0) out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
if n is not None: if n is not None:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'last_dim_size', n) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
n,
)
else: else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward) attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
...@@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name): ...@@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype)) _complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
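In the legacy-dygraph branches above, operator attributes are passed as one flat, interleaved name/value tuple and unpacked with `*attrs`. A hedged pure-Python sketch of how such a tuple relates to a plain dict of attributes (the dict-based helper is illustrative only):

.. code-block:: python

    def flatten_attrs(attr_dict):
        # Produce ('axes', axes, 'normalization', norm, ...) -- name/value
        # pairs interleaved in a single tuple, ready to unpack with *attrs.
        flat = []
        for name, value in attr_dict.items():
            flat.extend((name, value))
        return tuple(flat)

    attrs = flatten_attrs(
        {'axes': [0], 'normalization': 'backward', 'forward': True, 'last_dim_size': 8}
    )
    # attrs == ('axes', [0], 'normalization', 'backward',
    #           'forward', True, 'last_dim_size', 8)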
...@@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name): ...@@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] s = [s[i] for i in axes_argsoft]
if s is not None: if s is not None:
...@@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): ...@@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
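The `s = [s[i] for i in axes_argsoft]` step above keeps each FFT length paired with its axis once the axes are sorted; `axes_argsoft` is assumed here to be the argsort of the user-supplied axes, computed earlier in the function outside this hunk. A small numpy sketch of the reordering:

.. code-block:: python

    import numpy as np

    axes = [2, 0, 1]
    s = [8, 16, 32]                     # s[i] is the length for axes[i]
    axes_argsoft = np.argsort(axes)     # assumed definition: argsort of axes
    s_reordered = [s[i] for i in axes_argsoft]
    # sorted axes -> [0, 1, 2], s_reordered -> [16, 32, 8];
    # each axis keeps the length it was originally paired with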
...@@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]] s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None: if s is not None:
...@@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if in_dygraph_mode(): if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'onesided', onesided) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else: else:
inputs = { inputs = {
...@@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype)) _real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]] s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None: if s is not None:
...@@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0) out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
if s: if s:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'last_dim_size', s[-1]) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
s[-1],
)
else: else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward) attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
...@@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype)) _complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -23,9 +23,9 @@ from ...log_helper import get_logger ...@@ -23,9 +23,9 @@ from ...log_helper import get_logger
__all__ = ['add_supported_layer'] __all__ = ['add_supported_layer']
_logger = get_logger(__name__, _logger = get_logger(
logging.INFO, __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
fmt='%(asctime)s-%(levelname)s: %(message)s') )
def _default_pruning(weight_nparray, m, n, func_name, param_name): def _default_pruning(weight_nparray, m, n, func_name, param_name):
...@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): ...@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
exlude_cond_shape4 = len(shape) == 4 and shape[1] < m exlude_cond_shape4 = len(shape) == 4 and shape[1] < m
if exlude_cond_shape2: if exlude_cond_shape2:
_logger.warning( _logger.warning(
'{} is not pruned because the first dimension of {} is smaller than {}' '{} is not pruned because the first dimension of {} is smaller than {}'.format(
.format(param_name, shape, m)) param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
if exlude_cond_shape4: if exlude_cond_shape4:
_logger.warning( _logger.warning(
'{} is not pruned because the second dimension of {} is smaller than {}' '{} is not pruned because the second dimension of {} is smaller than {}'.format(
.format(param_name, shape, m)) param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
...@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): ...@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
# sparsity/utils is row-major pruning. That is the reason we have to transpose weight # sparsity/utils is row-major pruning. That is the reason we have to transpose weight
# matrices before invoking create_mask. Then we transpose the result mask to make # matrices before invoking create_mask. Then we transpose the result mask to make
# sure its shape to be the same as the input weight. # sure its shape to be the same as the input weight.
weight_sparse_mask = sparsity.create_mask(weight_nparray.T, weight_sparse_mask = sparsity.create_mask(
func_name=func_name, weight_nparray.T, func_name=func_name, n=n, m=m
n=n, ).T
m=m).T
weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ assert sparsity.check_sparsity(
'Pruning {} weight matrix failure!!!'.format(param_name) weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name
), 'Pruning {} weight matrix failure!!!'.format(param_name)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
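Because the mask utilities prune row-wise (as the comment above notes), `_default_pruning` transposes the weight, builds the mask, and transposes the mask back. A toy numpy sketch of that transpose-prune-transpose flow, using a simple keep-the-larger-half rule in place of `sparsity.create_mask` (purely illustrative):

.. code-block:: python

    import numpy as np

    def toy_row_mask(mat):
        # Keep the larger half of each row by magnitude -- a stand-in for the
        # real row-major n:m mask computed by sparsity.create_mask.
        mask = np.zeros_like(mat)
        keep = mat.shape[1] // 2
        idx = np.argsort(np.absolute(mat), axis=1)[:, keep:]
        np.put_along_axis(mask, idx, 1.0, axis=1)
        return mask

    weight = np.random.randn(4, 8)
    mask = toy_row_mask(weight.T).T            # prune along the transposed rows
    pruned = np.multiply(weight, mask)         # same shape as the input weight
    assert pruned.shape == weight.shape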
...@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {} ...@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None): def add_supported_layer(layer, pruning_func=None):
r""" r"""
Add supported layers and its corresponding pruning function. Add supported layers and its corresponding pruning function.
Args: Args:
...@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None): ...@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None):
pruning_func (function, optional): a function type which receives five argument (weight_nparray, pruning_func (function, optional): a function type which receives five argument (weight_nparray,
m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
m, n, and func_name, please see `prune_model` for details. m, n, and func_name, please see `prune_model` for details.
""" """
name = None name = None
if isinstance(layer, str): if isinstance(layer, str):
name = layer name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake( name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
type(layer).__name__) type(layer).__name__
)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake( name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__) layer.__name__
)
else: else:
assert "The type of layer should be string of Layer, but got {}!".format( assert (
type(layer)) "The type of layer should be string of Layer, but got {}!".format(
type(layer)
)
)
if pruning_func is None: if pruning_func is None:
pruning_func = _default_pruning pruning_func = _default_pruning
_supported_layers_and_prune_func_map_lock.acquire() _supported_layers_and_prune_func_map_lock.acquire()
......
...@@ -27,9 +27,16 @@ from itertools import permutations ...@@ -27,9 +27,16 @@ from itertools import permutations
import threading import threading
__all__ = [ __all__ = [
'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'calculate_density',
'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'check_mask_1d',
'MaskAlgo', 'CheckMethod' 'get_mask_1d',
'check_mask_2d',
'get_mask_2d_greedy',
'get_mask_2d_best',
'create_mask',
'check_sparsity',
'MaskAlgo',
'CheckMethod',
] ]
...@@ -76,8 +83,9 @@ class CheckMethod(Enum): ...@@ -76,8 +83,9 @@ class CheckMethod(Enum):
CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
# CheckMethod.CHECK_2D # CheckMethod.CHECK_2D
""" """
assert isinstance(mask_algo, MaskAlgo), \ assert isinstance(
"mask_algo should be MaskAlgo type" mask_algo, MaskAlgo
), "mask_algo should be MaskAlgo type"
if mask_algo == MaskAlgo.MASK_1D: if mask_algo == MaskAlgo.MASK_1D:
return CheckMethod.CHECK_1D return CheckMethod.CHECK_1D
else: else:
...@@ -86,20 +94,25 @@ class CheckMethod(Enum): ...@@ -86,20 +94,25 @@ class CheckMethod(Enum):
def calculate_density(x): def calculate_density(x):
r""" r"""
Return the density of the input tensor. Return the density of the input tensor.
Args: Args:
x (nparray): The input tensor. x (nparray): The input tensor.
Returns: Returns:
float: The density of :attr:`x`. float, The density of :attr:`x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import numpy as np import numpy as np
x = np.array([[0, 1, 3, 0], x = np.array([[0, 1, 3, 0],
[1, 1, 0, 1]]) [1, 1, 0, 1]])
paddle.incubate.asp.calculate_density(x) # 0.625 paddle.incubate.asp.calculate_density(x) # 0.625
""" """
x_flattened = x.flatten() x_flattened = x.flatten()
return float(np.nonzero(x_flattened)[0].size) / x_flattened.size return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
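Density is just the fraction of non-zero entries, so the docstring's 0.625 can be reproduced with plain numpy:

.. code-block:: python

    import numpy as np

    x = np.array([[0, 1, 3, 0],
                  [1, 1, 0, 1]])
    density = float(np.count_nonzero(x)) / x.size
    # density == 0.625, matching the calculate_density example above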
...@@ -126,7 +139,7 @@ def _reshape_1d(mat, m): ...@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
remainder = mat.shape[1] % m remainder = mat.shape[1] % m
if mat.shape[1] % m > 0: if mat.shape[1] % m > 0:
mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
mat_padded[:, :mat.shape[1]] = mat mat_padded[:, : mat.shape[1]] = mat
shape = mat_padded.shape shape = mat_padded.shape
return mat_padded.reshape(-1, m), shape return mat_padded.reshape(-1, m), shape
else: else:
...@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m): ...@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
min_order_indices = np.argsort(np.absolute(sub_mat)) min_order_indices = np.argsort(np.absolute(sub_mat))
mask_flattern[i, min_order_indices[:n].tolist()] = 0 mask_flattern[i, min_order_indices[:n].tolist()] = 0
mask_flattern = mask_flattern.reshape(shape) mask_flattern = mask_flattern.reshape(shape)
mask[:, :] = mask_flattern[:, :mat.shape[1]] mask[:, :] = mask_flattern[:, : mat.shape[1]]
return mask return mask
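get_mask_1d zeroes the n smallest-magnitude entries in every group of m along each row (after the padding done by _reshape_1d). A minimal numpy sketch of that rule on one already-aligned row:

.. code-block:: python

    import numpy as np

    row = np.array([0.1, -2.0, 0.3, 1.5, -0.2, 0.05, 4.0, -1.0])
    n, m = 2, 4
    groups = row.reshape(-1, m)
    mask = np.ones_like(groups)
    for i, group in enumerate(groups):
        smallest = np.argsort(np.absolute(group))[:n]  # n smallest magnitudes
        mask[i, smallest] = 0
    mask = mask.reshape(row.shape)
    # mask -> [0, 1, 0, 1, 0, 0, 1, 1]: two entries survive in each group of four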
...@@ -239,12 +252,12 @@ def _reshape_2d(mat, m): ...@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
remainder_0 = mat.shape[0] % m remainder_0 = mat.shape[0] % m
remainder_1 = mat.shape[1] % m remainder_1 = mat.shape[1] % m
new_shape = (mat.shape[0] if remainder_0 == 0 \ new_shape = (
else mat.shape[0] + (m - remainder_0), mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0),
mat.shape[1] if remainder_1 == 0 \ mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1),
else mat.shape[1] + (m - remainder_1)) )
mat_padded = np.zeros(new_shape) mat_padded = np.zeros(new_shape)
mat_padded[:mat.shape[0], :mat.shape[1]] = mat mat_padded[: mat.shape[0], : mat.shape[1]] = mat
mat_flattern = np.empty(new_shape).reshape(-1, m * m) mat_flattern = np.empty(new_shape).reshape(-1, m * m)
curr_idx = 0 curr_idx = 0
...@@ -252,9 +265,9 @@ def _reshape_2d(mat, m): ...@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
row_end = row_start + m row_end = row_start + m
for col_start in range(0, mat_padded.shape[1], m): for col_start in range(0, mat_padded.shape[1], m):
col_end = col_start + m col_end = col_start + m
sub_mat = np.squeeze(mat_padded[row_start:row_end, \ sub_mat = np.squeeze(
col_start:col_end] \ mat_padded[row_start:row_end, col_start:col_end].reshape(-1)
.reshape(-1)) )
mat_flattern[curr_idx] = sub_mat mat_flattern[curr_idx] = sub_mat
curr_idx += 1 curr_idx += 1
return mat_flattern, mat_padded.shape return mat_flattern, mat_padded.shape
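_reshape_2d pads both dimensions up to the next multiple of m and flattens each m x m block into one row, so the 2-D mask rules can operate block by block. A compact numpy sketch of the same padding and blocking:

.. code-block:: python

    import numpy as np

    def pad_to_multiple(mat, m):
        # Round each dimension up to a multiple of m, zero-filling the rest.
        rows = -(-mat.shape[0] // m) * m
        cols = -(-mat.shape[1] // m) * m
        padded = np.zeros((rows, cols))
        padded[: mat.shape[0], : mat.shape[1]] = mat
        return padded

    mat = np.arange(6.0).reshape(2, 3)
    padded = pad_to_multiple(mat, 4)          # shape (4, 4)
    blocks = [
        padded[r : r + 4, c : c + 4].reshape(-1)
        for r in range(0, padded.shape[0], 4)
        for c in range(0, padded.shape[1], 4)
    ]
    # one flattened 4x4 block of length 16, like the rows of mat_flattern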
...@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m): ...@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m):
mat_padded, shape = _reshape_2d(mat, m) mat_padded, shape = _reshape_2d(mat, m)
for sub_mat in mat_padded: for sub_mat in mat_padded:
sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and (
(np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0): np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0
):
return False return False
return True return True
...@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m): ...@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
sub_mask = np.squeeze(mask_padded[idx]) sub_mask = np.squeeze(mask_padded[idx])
min_order_1d_indices = np.argsort(sub_mat) min_order_1d_indices = np.argsort(sub_mat)
min_order_2d_indices = [(int(x / m), x % m) min_order_2d_indices = [
for x in min_order_1d_indices] (int(x / m), x % m) for x in min_order_1d_indices
]
row_counter = collections.Counter() row_counter = collections.Counter()
col_counter = collections.Counter() col_counter = collections.Counter()
for i in range(len(min_order_1d_indices) - 1, -1, -1): for i in range(len(min_order_1d_indices) - 1, -1, -1):
matrix_entry = min_order_2d_indices[i] matrix_entry = min_order_2d_indices[i]
if (row_counter[matrix_entry[0]] == n) or \ if (row_counter[matrix_entry[0]] == n) or (
(col_counter[matrix_entry[1]] == n): col_counter[matrix_entry[1]] == n
):
continue continue
sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0 sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
...@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m): ...@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
col_end = col_start + m col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx] mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
curr_idx += 1 curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]] return mask[: mat.shape[0], : mat.shape[1]]
_valid_2d_patterns_lock = threading.Lock() _valid_2d_patterns_lock = threading.Lock()
...@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m): ...@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
patterns = patterns + patterns patterns = patterns + patterns
patterns = np.asarray(list(set(permutations(patterns, m)))) patterns = np.asarray(list(set(permutations(patterns, m))))
valid = ((patterns.sum(axis=1) <= n).sum( valid = (
axis=1) == m).nonzero()[0].reshape(-1) ((patterns.sum(axis=1) <= n).sum(axis=1) == m)
.nonzero()[0]
.reshape(-1)
)
valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns = np.empty((valid.shape[0], m, m))
valid_patterns[:] = patterns[valid[:]] valid_patterns[:] = patterns[valid[:]]
...@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m): ...@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
mat_flattern, shape = _reshape_2d(mat, m) mat_flattern, shape = _reshape_2d(mat, m)
mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
pmax = np.argmax(np.matmul(mat_flattern, pmax = np.argmax(
patterns.reshape(patterns.shape[0], m * m).T), np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
axis=1) axis=1,
)
mask_flattern[:] = patterns[pmax[:]] mask_flattern[:] = patterns[pmax[:]]
mask = np.empty(shape) mask = np.empty(shape)
...@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m): ...@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
col_end = col_start + m col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
curr_idx += 1 curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]] return mask[: mat.shape[0], : mat.shape[1]]
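get_mask_2d_best scores every valid m x m pattern against every block with a dot product and keeps the highest-scoring pattern per block. A hedged numpy sketch of that selection step, with a tiny hand-made pattern set standing in for _compute_valid_2d_patterns:

.. code-block:: python

    import numpy as np

    m = 2
    # Two toy 2x2 patterns; the real code enumerates all valid n:m patterns.
    patterns = np.array([[[1, 0], [0, 1]],
                         [[0, 1], [1, 0]]], dtype=float)
    blocks = np.abs(np.random.randn(5, m * m))   # 5 flattened blocks (magnitudes)

    scores = np.matmul(blocks, patterns.reshape(patterns.shape[0], m * m).T)
    best = np.argmax(scores, axis=1)             # index of best pattern per block
    block_masks = patterns[best]                 # shape (5, 2, 2), one mask per block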
def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
...@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): ...@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
dtype = tensor.dtype dtype = tensor.dtype
t = tensor.astype(float) t = tensor.astype(float)
assert isinstance(func_name, MaskAlgo), \ assert isinstance(func_name, MaskAlgo), (
"func_name argumet of create_mask is only accepted as type MaskAlgo. " \ "func_name argumet of create_mask is only accepted as type MaskAlgo. "
"But got {}".format(type(func_name)) "But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None) func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1: if len(shape) == 1:
t = t.reshape(1, shape[0]) t = t.reshape(1, shape[0])
...@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): ...@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2]) t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4: elif len(shape) == 4:
t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], t = t.transpose([0, 1, 3, 2]).reshape(
shape[2]) shape[0] * shape[1] * shape[3], shape[2]
)
mask = func(t, n=n, m=m) mask = func(t, n=n, m=m)
return mask.reshape([shape[0], shape[1], shape[3], return (
shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) mask.reshape([shape[0], shape[1], shape[3], shape[2]])
.transpose([0, 1, 3, 2])
.astype(dtype)
)
else: else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \ raise ValueError(
"Only dimension < 4 is supported but got {}".format(len(shape))) "The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
mask = func(t, n=n, m=m) mask = func(t, n=n, m=m)
return mask.reshape(shape).astype(dtype) return mask.reshape(shape).astype(dtype)
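For 4-D convolution weights, create_mask transposes (h, w, in, out) to (h, w, out, in), flattens to (h*w*out, in) for masking, and then reverses the reshape, as the branch above shows. A numpy sketch of just that round trip:

.. code-block:: python

    import numpy as np

    h, w, cin, cout = 3, 3, 8, 4
    weight = np.random.randn(h, w, cin, cout)

    t = weight.transpose([0, 1, 3, 2]).reshape(h * w * cout, cin)
    # ... a 2-D mask of shape (h*w*cout, cin) would be computed here ...
    restored = t.reshape([h, w, cout, cin]).transpose([0, 1, 3, 2])

    assert restored.shape == weight.shape
    assert np.allclose(restored, weight)   # the reshape round trip is lossless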
...@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): ...@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
shape = tensor.shape shape = tensor.shape
t = tensor.astype(float) t = tensor.astype(float)
assert type(func_name) == CheckMethod, \ assert type(func_name) == CheckMethod, (
"func_name argumet of check_sparsity is only accepted as type CheckMethod. " \ "func_name argumet of check_sparsity is only accepted as type CheckMethod. "
"But got {}".format(type(func_name)) "But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None) func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1: if len(shape) == 1:
t = t.reshape(1, shape[0]) t = t.reshape(1, shape[0])
...@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): ...@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2]) t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4: elif len(shape) == 4:
t = t.transpose([0, 1, 3, t = t.transpose([0, 1, 3, 2]).reshape(
2]).reshape([shape[0] * shape[1] * shape[3], shape[2]]) [shape[0] * shape[1] * shape[3], shape[2]]
)
else: else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \ raise ValueError(
"Only dimension < 4 is supported but got {}".format(len(shape))) "The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
return func(t, n=n, m=m) return func(t, n=n, m=m)
...@@ -32,12 +32,25 @@ from . import parallel_helper ...@@ -32,12 +32,25 @@ from . import parallel_helper
from .. import unique_name from .. import unique_name
from paddle.fluid import core from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper from .layer_object_helper import LayerObjectHelper
from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder from .layer_hooks import (
from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable record_program_ops_pre_hook,
set_op_customized_attrs_post_hook,
LayerOpsRecoder,
)
from .base import (
program_desc_tracing_guard,
param_guard,
in_declarative_mode,
_convert_into_variable,
)
from paddle.fluid import framework from paddle.fluid import framework
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from paddle.fluid.executor import Executor, global_scope from paddle.fluid.executor import Executor, global_scope
from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode from paddle.fluid.framework import (
_non_static_mode,
convert_np_dtype_to_dtype_,
in_dygraph_mode,
)
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.core import VarDesc from paddle.fluid.core import VarDesc
...@@ -67,7 +80,7 @@ def _addindent(string, indent): ...@@ -67,7 +80,7 @@ def _addindent(string, indent):
class HookRemoveHelper(object): class HookRemoveHelper(object):
""" A HookRemoveHelper that can be used to remove hook. """ """A HookRemoveHelper that can be used to remove hook."""
next_hook_id = 0 next_hook_id = 0
...@@ -153,13 +166,14 @@ class Layer(object): ...@@ -153,13 +166,14 @@ class Layer(object):
def train(self): def train(self):
""" """
Sets this Layer and all its sublayers to training mode. Sets this Layer and all its sublayers to training mode.
This only affects certain modules like `Dropout` and `BatchNorm`. This only affects certain modules like `Dropout` and `BatchNorm`.
Returns: Returns:
None None
Example:: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -236,6 +250,7 @@ class Layer(object): ...@@ -236,6 +250,7 @@ class Layer(object):
def apply(self, fn): def apply(self, fn):
""" """
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``) Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model. as well as self. Typical use includes initializing the parameters of a model.
...@@ -243,7 +258,7 @@ class Layer(object): ...@@ -243,7 +258,7 @@ class Layer(object):
fn (function): a function to be applied to each sublayer fn (function): a function to be applied to each sublayer
Returns: Returns:
Layer: self Layer, self
Example:: Example::
.. code-block:: python .. code-block:: python
...@@ -263,6 +278,7 @@ class Layer(object): ...@@ -263,6 +278,7 @@ class Layer(object):
net.apply(init_weights) net.apply(init_weights)
print(net.state_dict()) print(net.state_dict())
""" """
for layer in self.children(): for layer in self.children():
layer.apply(fn) layer.apply(fn)
...@@ -272,10 +288,12 @@ class Layer(object): ...@@ -272,10 +288,12 @@ class Layer(object):
return self return self
def full_name(self): def full_name(self):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ """
Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
Returns: Returns:
str: full name of this layer. str, full name of this layer.
Example:: Example::
.. code-block:: python .. code-block:: python
...@@ -297,7 +315,9 @@ class Layer(object): ...@@ -297,7 +315,9 @@ class Layer(object):
return self._full_name return self._full_name
def register_forward_post_hook(self, hook): def register_forward_post_hook(self, hook):
"""Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. """
Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively. It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively.
User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer. User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer.
...@@ -308,7 +328,7 @@ class Layer(object): ...@@ -308,7 +328,7 @@ class Layer(object):
hook(function): a function registered as a forward post-hook hook(function): a function registered as a forward post-hook
Returns: Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -340,13 +360,16 @@ class Layer(object): ...@@ -340,13 +360,16 @@ class Layer(object):
# hook change the linear's output to output * 2, so out0 is equal to out1 * 2. # hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
assert (out0.numpy() == (out1.numpy()) * 2).any() assert (out0.numpy() == (out1.numpy()) * 2).any()
""" """
hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) hook_remove_helper = HookRemoveHelper(self._forward_post_hooks)
self._forward_post_hooks[hook_remove_helper._hook_id] = hook self._forward_post_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
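A minimal usage sketch of the post-hook mechanism, following the output-doubling example referenced in the docstring above (assumes paddle is installed; the layer and shapes are arbitrary):

.. code-block:: python

    import paddle

    def double_output_hook(layer, inputs, output):
        # A forward post-hook receives the layer, its inputs and its output,
        # and may return a replacement output.
        return output * 2

    linear = paddle.nn.Linear(4, 4)
    x = paddle.rand([2, 4])

    handle = linear.register_forward_post_hook(double_output_hook)
    out_hooked = linear(x)
    handle.remove()                  # HookRemoveHelper.remove() detaches the hook
    out_plain = linear(x)

    assert paddle.allclose(out_hooked, out_plain * 2)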
def register_forward_pre_hook(self, hook): def register_forward_pre_hook(self, hook):
"""Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. """
Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
It should have the following form, `input` of the `hook` is `input` of the `Layer`, It should have the following form, `input` of the `hook` is `input` of the `Layer`,
hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if
...@@ -359,7 +382,7 @@ class Layer(object): ...@@ -359,7 +382,7 @@ class Layer(object):
hook(function): a function registered as a forward pre-hook hook(function): a function registered as a forward pre-hook
Returns: Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -398,12 +421,14 @@ class Layer(object): ...@@ -398,12 +421,14 @@ class Layer(object):
self._forward_pre_hooks[hook_remove_helper._hook_id] = hook self._forward_pre_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
def create_parameter(self, def create_parameter(
self,
shape, shape,
attr=None, attr=None,
dtype=None, dtype=None,
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None,
):
"""Create parameters for this layer. """Create parameters for this layer.
Parameters: Parameters:
...@@ -443,12 +468,15 @@ class Layer(object): ...@@ -443,12 +468,15 @@ class Layer(object):
temp_attr = copy.deepcopy(attr) temp_attr = copy.deepcopy(attr)
if isinstance(temp_attr, six.string_types) and temp_attr == "": if isinstance(temp_attr, six.string_types) and temp_attr == "":
temp_attr = None temp_attr = None
return self._helper.create_parameter(temp_attr, shape, dtype, is_bias, return self._helper.create_parameter(
default_initializer) temp_attr, shape, dtype, is_bias, default_initializer
)
@deprecated(since="2.0.0", @deprecated(
since="2.0.0",
update_to="paddle.nn.Layer.create_tensor", update_to="paddle.nn.Layer.create_tensor",
reason="New api in create_tensor, easier to use.") reason="New api in create_tensor, easier to use.",
)
def create_variable(self, name=None, persistable=None, dtype=None): def create_variable(self, name=None, persistable=None, dtype=None):
""" """
...@@ -488,14 +516,16 @@ class Layer(object): ...@@ -488,14 +516,16 @@ class Layer(object):
if name is not None: if name is not None:
var_name = ".".join([self._full_name, name]) var_name = ".".join([self._full_name, name])
else: else:
var_name = unique_name.generate(".".join( var_name = unique_name.generate(
[self._full_name, "_generated_var"])) ".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var( return self._helper.main_program.current_block().create_var(
name=var_name, name=var_name,
persistable=persistable, persistable=persistable,
dtype=dtype, dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR) type=core.VarDesc.VarType.LOD_TENSOR,
)
# TODO: Add more parameter list when we need them # TODO: Add more parameter list when we need them
def create_tensor(self, name=None, persistable=None, dtype=None): def create_tensor(self, name=None, persistable=None, dtype=None):
...@@ -538,20 +568,24 @@ class Layer(object): ...@@ -538,20 +568,24 @@ class Layer(object):
if name is not None: if name is not None:
var_name = ".".join([self._full_name, name]) var_name = ".".join([self._full_name, name])
else: else:
var_name = unique_name.generate(".".join( var_name = unique_name.generate(
[self._full_name, "_generated_var"])) ".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var( return self._helper.main_program.current_block().create_var(
name=var_name, name=var_name,
persistable=persistable, persistable=persistable,
dtype=dtype, dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR) type=core.VarDesc.VarType.LOD_TENSOR,
)
def parameters(self, include_sublayers=True): def parameters(self, include_sublayers=True):
"""Returns a list of all Parameters from current layer and its sub-layers. """
Returns a list of all Parameters from current layer and its sub-layers.
Returns: Returns:
list of Tensor : a list of Parameters. list of Tensor, a list of Parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -563,13 +597,17 @@ class Layer(object): ...@@ -563,13 +597,17 @@ class Layer(object):
""" """
ret = [ ret = [
param for _, param in self.named_parameters( param
include_sublayers=include_sublayers) for _, param in self.named_parameters(
include_sublayers=include_sublayers
)
] ]
return ret return ret
def children(self): def children(self):
"""Returns an iterator over immediate children layers. """
Returns an iterator over immediate children layers.
Yields: Yields:
Layer: a child layer Layer: a child layer
...@@ -619,13 +657,15 @@ class Layer(object): ...@@ -619,13 +657,15 @@ class Layer(object):
yield name, layer yield name, layer
def sublayers(self, include_self=False): def sublayers(self, include_self=False):
"""Returns a list of sub layers. """
Returns a list of sub layers.
Parameters: Parameters:
include_self(bool, optional): Whether return self as sublayers. Default: False include_self(bool, optional): Whether return self as sublayers. Default: False
Returns: Returns:
list of Layer : a list of sub layers. list of Layer, a list of sub layers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -678,9 +718,11 @@ class Layer(object): ...@@ -678,9 +718,11 @@ class Layer(object):
""" """
params_set = set() params_set = set()
named_sublayers = self.named_sublayers( named_sublayers = (
prefix=prefix, include_self=True) if include_sublayers else zip( self.named_sublayers(prefix=prefix, include_self=True)
[prefix], [self]) if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers: for layer_prefix, sublayer in named_sublayers:
params = sublayer._parameters.items() params = sublayer._parameters.items()
for key, param in params: for key, param in params:
...@@ -724,9 +766,9 @@ class Layer(object): ...@@ -724,9 +766,9 @@ class Layer(object):
if layer is None: if layer is None:
continue continue
layer_prefix = prefix + ('.' if prefix else '') + key layer_prefix = prefix + ('.' if prefix else '') + key
for p, l in layer.named_sublayers(prefix=layer_prefix, for p, l in layer.named_sublayers(
include_self=True, prefix=layer_prefix, include_self=True, layers_set=layers_set
layers_set=layers_set): ):
yield p, l yield p, l
def register_buffer(self, name, tensor, persistable=True): def register_buffer(self, name, tensor, persistable=True):
...@@ -769,25 +811,32 @@ class Layer(object): ...@@ -769,25 +811,32 @@ class Layer(object):
if '_buffers' not in self.__dict__: if '_buffers' not in self.__dict__:
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first") "super(YourLayer, self).__init__() should be called first"
)
elif not isinstance(name, six.string_types): elif not isinstance(name, six.string_types):
raise TypeError( raise TypeError(
"The name of buffer should be a string, but received {}.". "The name of buffer should be a string, but received {}.".format(
format(type(name).__name__)) type(name).__name__
)
)
elif '.' in name: elif '.' in name:
raise KeyError( raise KeyError(
"The name of buffer can not contain `.`, " "The name of buffer can not contain `.`, "
"because when you access the newly added buffer in the " "because when you access the newly added buffer in the "
"form of `self.**.**`, it will cause AttributeError.") "form of `self.**.**`, it will cause AttributeError."
)
elif name == '': elif name == '':
raise KeyError("The name of buffer can not be empty.") raise KeyError("The name of buffer can not be empty.")
elif hasattr(self, name) and name not in self._buffers: elif hasattr(self, name) and name not in self._buffers:
raise KeyError("attribute '{}' already exists.".format(name)) raise KeyError("attribute '{}' already exists.".format(name))
elif tensor is not None and not (type(tensor) == core.VarBase elif tensor is not None and not (
or type(tensor) == core.eager.Tensor): type(tensor) == core.VarBase or type(tensor) == core.eager.Tensor
):
raise TypeError( raise TypeError(
"The registered buffer should be a Paddle.Tensor, but received {}." "The registered buffer should be a Paddle.Tensor, but received {}.".format(
.format(type(tensor).__name__)) type(tensor).__name__
)
)
else: else:
self._buffers[name] = tensor self._buffers[name] = tensor
if persistable: if persistable:
...@@ -797,13 +846,14 @@ class Layer(object): ...@@ -797,13 +846,14 @@ class Layer(object):
def buffers(self, include_sublayers=True): def buffers(self, include_sublayers=True):
""" """
Returns a list of all buffers from current layer and its sub-layers. Returns a list of all buffers from current layer and its sub-layers.
Parameters: Parameters:
include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
Returns: Returns:
list of Tensor : a list of buffers. list of Tensor, a list of buffers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -820,8 +870,10 @@ class Layer(object): ...@@ -820,8 +870,10 @@ class Layer(object):
""" """
ret = [ ret = [
buffer for _, buffer in self.named_buffers( buffer
include_sublayers=include_sublayers) for _, buffer in self.named_buffers(
include_sublayers=include_sublayers
)
] ]
return ret return ret
...@@ -862,9 +914,11 @@ class Layer(object): ...@@ -862,9 +914,11 @@ class Layer(object):
""" """
buffers_set = set() buffers_set = set()
named_sublayers = self.named_sublayers( named_sublayers = (
prefix=prefix, include_self=True) if include_sublayers else zip( self.named_sublayers(prefix=prefix, include_self=True)
[prefix], [self]) if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers: for layer_prefix, sublayer in named_sublayers:
buffers = sublayer._buffers.items() buffers = sublayer._buffers.items()
for key, buffer in buffers: for key, buffer in buffers:
...@@ -910,7 +964,7 @@ class Layer(object): ...@@ -910,7 +964,7 @@ class Layer(object):
hook_result = forward_pre_hook(self, inputs) hook_result = forward_pre_hook(self, inputs)
if hook_result is not None: if hook_result is not None:
if not isinstance(hook_result, tuple): if not isinstance(hook_result, tuple):
hook_result = (hook_result, ) hook_result = (hook_result,)
inputs = hook_result inputs = hook_result
if not self._built: if not self._built:
...@@ -920,16 +974,20 @@ class Layer(object): ...@@ -920,16 +974,20 @@ class Layer(object):
# TODO(liuyuhui) Only xpu broadcast parameters here. # TODO(liuyuhui) Only xpu broadcast parameters here.
# The other device is to call _sync_params_buffers in DataParallel # The other device is to call _sync_params_buffers in DataParallel
# to realize the parameter synchronization among multiply cards. # to realize the parameter synchronization among multiply cards.
if parallel_helper._is_data_parallel_mode( if (
) and paddle.is_compiled_with_xpu(): parallel_helper._is_data_parallel_mode()
and paddle.is_compiled_with_xpu()
):
parallel_helper._broadcast_parameters( parallel_helper._broadcast_parameters(
self._parameters.values()) self._parameters.values()
)
self._built = True self._built = True
if in_profiler_mode(): if in_profiler_mode():
with profiler.RecordEvent(self.__class__.__name__, with profiler.RecordEvent(
profiler.TracerEventType.Forward): self.__class__.__name__, profiler.TracerEventType.Forward
):
outputs = self.forward(*inputs, **kwargs) outputs = self.forward(*inputs, **kwargs)
else: else:
outputs = self.forward(*inputs, **kwargs) outputs = self.forward(*inputs, **kwargs)
...@@ -942,8 +1000,14 @@ class Layer(object): ...@@ -942,8 +1000,14 @@ class Layer(object):
return outputs return outputs
def __call__(self, *inputs, **kwargs): def __call__(self, *inputs, **kwargs):
if (not in_declarative_mode()) and (not self._forward_pre_hooks) \ if (
and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()): (not in_declarative_mode())
and (not self._forward_pre_hooks)
and (not self._forward_post_hooks)
and (not self._built)
and in_dygraph_mode()
and (not in_profiler_mode())
):
self._build_once(*inputs, **kwargs) self._build_once(*inputs, **kwargs)
return self.forward(*inputs, **kwargs) return self.forward(*inputs, **kwargs)
else: else:
...@@ -964,7 +1028,9 @@ class Layer(object): ...@@ -964,7 +1028,9 @@ class Layer(object):
raise ValueError("Layer shouldn't implement backward") raise ValueError("Layer shouldn't implement backward")
def add_sublayer(self, name, sublayer): def add_sublayer(self, name, sublayer):
"""Adds a sub Layer instance. """
Adds a sub Layer instance.
Added sublayer can be accessed by self.name Added sublayer can be accessed by self.name
...@@ -972,7 +1038,7 @@ class Layer(object): ...@@ -972,7 +1038,7 @@ class Layer(object):
name(str): name of this sublayer. name(str): name of this sublayer.
sublayer(Layer): an instance of Layer. sublayer(Layer): an instance of Layer.
Returns: Returns:
Layer: the sublayer passed in. Layer, the sublayer passed in.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -999,8 +1065,9 @@ class Layer(object): ...@@ -999,8 +1065,9 @@ class Layer(object):
model = MySequential(fc1, fc2) model = MySequential(fc1, fc2)
for prefix, layer in model.named_sublayers(): for prefix, layer in model.named_sublayers():
print(prefix, layer) print(prefix, layer)
""" """
assert (isinstance(sublayer, Layer) or sublayer == None) assert isinstance(sublayer, Layer) or sublayer == None
self._sub_layers[name] = sublayer self._sub_layers[name] = sublayer
return sublayer return sublayer
...@@ -1014,7 +1081,7 @@ class Layer(object): ...@@ -1014,7 +1081,7 @@ class Layer(object):
name(str): name of this sublayer. name(str): name of this sublayer.
parameter(Parameter): an instance of Parameter. parameter(Parameter): an instance of Parameter.
Returns: Returns:
Parameter: the parameter passed in. Parameter, the parameter passed in.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1037,32 +1104,42 @@ class Layer(object): ...@@ -1037,32 +1104,42 @@ class Layer(object):
""" """
if '_parameters' not in self.__dict__: if '_parameters' not in self.__dict__:
raise RuntimeError( raise RuntimeError(
"super(YourLayer, self).__init__() should be called firstly.") "super(YourLayer, self).__init__() should be called firstly."
)
elif not isinstance(name, six.string_types): elif not isinstance(name, six.string_types):
raise TypeError( raise TypeError(
"The name of parameter should be a string, but received {}.". "The name of parameter should be a string, but received {}.".format(
format(type(name).__name__)) type(name).__name__
)
)
elif '.' in name: elif '.' in name:
raise KeyError( raise KeyError(
"The name of parameter can not contain `.`, " "The name of parameter can not contain `.`, "
"because when you access the newly added parameter in the " "because when you access the newly added parameter in the "
"form of `self.**.**`, it will cause AttributeError.") "form of `self.**.**`, it will cause AttributeError."
)
elif name == '': elif name == '':
raise KeyError("The name of parameter can not be empty.") raise KeyError("The name of parameter can not be empty.")
elif hasattr(self, name) and name not in self._parameters: elif hasattr(self, name) and name not in self._parameters:
raise KeyError("The parameter '{}' already exists.".format(name)) raise KeyError("The parameter '{}' already exists.".format(name))
elif parameter is not None and not isinstance(parameter, elif parameter is not None and not isinstance(
framework.Parameter): parameter, framework.Parameter
):
raise TypeError( raise TypeError(
"The parameter to be added should be a Parameter, but received {}." "The parameter to be added should be a Parameter, but received {}.".format(
.format(type(parameter).__name__)) type(parameter).__name__
)
)
else: else:
if parameter is None: if parameter is None:
self._parameters[name] = None self._parameters[name] = None
if len(self._loaddict_holder) > 0: if len(self._loaddict_holder) > 0:
assert parameter.name in self._loaddict_holder, "Parameter not found, Can't find [ {} ] in state_dict".format( assert (
parameter.name) parameter.name in self._loaddict_holder
), "Parameter not found, Can't find [ {} ] in state_dict".format(
parameter.name
)
parameter.set_value(self._loaddict_holder[parameter.name]) parameter.set_value(self._loaddict_holder[parameter.name])
...@@ -1081,37 +1158,50 @@ class Layer(object): ...@@ -1081,37 +1158,50 @@ class Layer(object):
""" """
def is_already_registered(is_pre_hook): def is_already_registered(is_pre_hook):
layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks layers_hooks = (
candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook self._forward_pre_hooks
if is_pre_hook
else self._forward_post_hooks
)
candidate_hook = (
record_program_ops_pre_hook
if is_pre_hook
else set_op_customized_attrs_post_hook
)
already_registed = False already_registed = False
if layers_hooks: if layers_hooks:
last_key = next(reversed(layers_hooks)) last_key = next(reversed(layers_hooks))
already_registed = (layers_hooks[last_key] == candidate_hook) already_registed = layers_hooks[last_key] == candidate_hook
return already_registed return already_registed
if not isinstance(attrs, dict): if not isinstance(attrs, dict):
raise TypeError( raise TypeError(
"attrs should be type(dict), but received {}".format( "attrs should be type(dict), but received {}".format(
type(attrs).__name__)) type(attrs).__name__
)
)
# NOTE: Overwrite behavior for same key. # NOTE: Overwrite behavior for same key.
self._customized_attrs.update(attrs) self._customized_attrs.update(attrs)
if not is_already_registered(is_pre_hook=True): if not is_already_registered(is_pre_hook=True):
pre_hook_helper = self.register_forward_pre_hook( pre_hook_helper = self.register_forward_pre_hook(
record_program_ops_pre_hook) record_program_ops_pre_hook
)
assert len(self._op_recorder.hooks) == 0 assert len(self._op_recorder.hooks) == 0
self._op_recorder.hooks = [pre_hook_helper] self._op_recorder.hooks = [pre_hook_helper]
# manually register post_hook to ensure it is inserted into the head. # manually register post_hook to ensure it is inserted into the head.
if not is_already_registered(is_pre_hook=False): if not is_already_registered(is_pre_hook=False):
post_hook_helper = self.register_forward_post_hook( post_hook_helper = self.register_forward_post_hook(
set_op_customized_attrs_post_hook) set_op_customized_attrs_post_hook
)
if len(self._forward_post_hooks) > 1: if len(self._forward_post_hooks) > 1:
self._forward_post_hooks.move_to_end(post_hook_helper._hook_id, self._forward_post_hooks.move_to_end(
last=False) post_hook_helper._hook_id, last=False
)
assert len(self._op_recorder.hooks) == 1 assert len(self._op_recorder.hooks) == 1
...@@ -1144,7 +1234,6 @@ class Layer(object): ...@@ -1144,7 +1234,6 @@ class Layer(object):
return object.__getattribute__(self, name) return object.__getattribute__(self, name)
def __setattr__(self, name, value): def __setattr__(self, name, value):
def _remove_if_exist(*dicts): def _remove_if_exist(*dicts):
for d in dicts: for d in dicts:
if name in d: if name in d:
...@@ -1156,10 +1245,14 @@ class Layer(object): ...@@ -1156,10 +1245,14 @@ class Layer(object):
if isinstance(value, framework.Parameter): if isinstance(value, framework.Parameter):
if params is None: if params is None:
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first") "super(YourLayer, self).__init__() should be called first"
)
if len(self._loaddict_holder) > 0: if len(self._loaddict_holder) > 0:
assert value.name in self._loaddict_holder, "Parameter not found, Can't find [ {} ] in state_dict".format( assert (
value.name) value.name in self._loaddict_holder
), "Parameter not found, Can't find [ {} ] in state_dict".format(
value.name
)
value.set_value(self._loaddict_holder[value.name]) value.set_value(self._loaddict_holder[value.name])
...@@ -1168,9 +1261,10 @@ class Layer(object): ...@@ -1168,9 +1261,10 @@ class Layer(object):
elif params is not None and name in params: elif params is not None and name in params:
if value is not None: if value is not None:
raise TypeError( raise TypeError(
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'" "assignment to parameter '{}' should be of type Parameter or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
params[name] = None params[name] = None
else: else:
layers = self.__dict__.get('_sub_layers', None) layers = self.__dict__.get('_sub_layers', None)
...@@ -1185,9 +1279,10 @@ class Layer(object): ...@@ -1185,9 +1279,10 @@ class Layer(object):
elif layers is not None and name in layers: elif layers is not None and name in layers:
if value is not None: if value is not None:
raise TypeError( raise TypeError(
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'" "assignment to sublayer '{}' should be of type Layer or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
layers[name] = None layers[name] = None
else: else:
_buffers = self.__dict__.get('_buffers', None) _buffers = self.__dict__.get('_buffers', None)
...@@ -1196,8 +1291,9 @@ class Layer(object): ...@@ -1196,8 +1291,9 @@ class Layer(object):
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first" "super(YourLayer, self).__init__() should be called first"
) )
_remove_if_exist(self.__dict__, self._parameters, _remove_if_exist(
self._sub_layers) self.__dict__, self._parameters, self._sub_layers
)
# Set persistable=False by default. Only `register_buffer` can # Set persistable=False by default. Only `register_buffer` can
# add a persistable buffer. # add a persistable buffer.
if name not in self._buffers: if name not in self._buffers:
...@@ -1211,6 +1307,7 @@ class Layer(object): ...@@ -1211,6 +1307,7 @@ class Layer(object):
# value via `assign`. # value via `assign`.
if type(value) == framework.Variable: if type(value) == framework.Variable:
from paddle import assign from paddle import assign
# Note(zhhsplendid): the condition below happens in PaddleGan model, # Note(zhhsplendid): the condition below happens in PaddleGan model,
# but should all non-Variable _buffers[name] be re-assign? We # but should all non-Variable _buffers[name] be re-assign? We
# should consider it in the future. I currently wrote this as # should consider it in the future. I currently wrote this as
...@@ -1218,18 +1315,23 @@ class Layer(object): ...@@ -1218,18 +1315,23 @@ class Layer(object):
if in_declarative_mode() and _buffers[name] is None: if in_declarative_mode() and _buffers[name] is None:
raise RuntimeError( raise RuntimeError(
'In Dy2stat, self.{0} is a buffer and self.{0} is ' 'In Dy2stat, self.{0} is a buffer and self.{0} is '
'not allowed to be set to Variable when self.{0} is None.' 'not allowed to be set to Variable when self.{0} is None.'.format(
.format(name)) name
elif _buffers[name] is None or type(getattr( )
self, name)) == core.VarBase: )
elif (
_buffers[name] is None
or type(getattr(self, name)) == core.VarBase
):
_buffers[name] = assign(value) _buffers[name] = assign(value)
else: else:
assign(value, getattr(self, name)) assign(value, getattr(self, name))
elif value is not None: elif value is not None:
raise TypeError( raise TypeError(
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
else: else:
# Assigning None will remove the buffer, but if re-assign a new varBase to it, # Assigning None will remove the buffer, but if re-assign a new varBase to it,
# it will be remarked as a buffer with same `persistable` attribute. # it will be remarked as a buffer with same `persistable` attribute.
...@@ -1316,10 +1418,12 @@ class Layer(object): ...@@ -1316,10 +1418,12 @@ class Layer(object):
self._state_dict_hooks[hook_remove_helper._hook_id] = hook self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
def _obtain_parameters_buffers(self, def _obtain_parameters_buffers(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix=""): structured_name_prefix="",
):
""" """
The difference from state_dict() is that state_dict_hook will not be called, The difference from state_dict() is that state_dict_hook will not be called,
but the original types of parameters and buffers will be maintained. but the original types of parameters and buffers will be maintained.
...@@ -1330,7 +1434,10 @@ class Layer(object): ...@@ -1330,7 +1434,10 @@ class Layer(object):
if data is not None: if data is not None:
destination[structured_name_prefix + name] = data destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items(): for name, buffer in self._buffers.items():
if buffer is not None and name not in self._non_persistable_buffer_names_set: if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer destination[structured_name_prefix + name] = buffer
if include_sublayers: if include_sublayers:
...@@ -1339,17 +1446,22 @@ class Layer(object): ...@@ -1339,17 +1446,22 @@ class Layer(object):
destination_temp = destination.copy() destination_temp = destination.copy()
destination_temp.update( destination_temp.update(
layer_item._obtain_parameters_buffers( layer_item._obtain_parameters_buffers(
destination_temp, include_sublayers, destination_temp,
structured_name_prefix + layer_name + ".")) include_sublayers,
structured_name_prefix + layer_name + ".",
)
)
destination = destination_temp destination = destination_temp
return destination return destination
def _state_dict_impl(self, def _state_dict_impl(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
include_non_persistable_buffer=False, include_non_persistable_buffer=False,
use_hook=True): use_hook=True,
):
""" """
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...@@ -1367,7 +1479,10 @@ class Layer(object): ...@@ -1367,7 +1479,10 @@ class Layer(object):
destination[structured_name_prefix + name] = data destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items(): for name, buffer in self._buffers.items():
if not include_non_persistable_buffer: if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set: if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer destination[structured_name_prefix + name] = buffer
else: else:
if buffer is not None: if buffer is not None:
...@@ -1379,9 +1494,13 @@ class Layer(object): ...@@ -1379,9 +1494,13 @@ class Layer(object):
destination_temp = destination.copy() destination_temp = destination.copy()
destination_temp.update( destination_temp.update(
layer_item._state_dict_impl( layer_item._state_dict_impl(
destination_temp, include_sublayers, destination_temp,
include_sublayers,
structured_name_prefix + layer_name + ".", structured_name_prefix + layer_name + ".",
include_non_persistable_buffer, use_hook)) include_non_persistable_buffer,
use_hook,
)
)
destination = destination_temp destination = destination_temp
if use_hook: if use_hook:
for state_dict_hook in self._state_dict_hooks.values(): for state_dict_hook in self._state_dict_hooks.values():
...@@ -1391,12 +1510,15 @@ class Layer(object): ...@@ -1391,12 +1510,15 @@ class Layer(object):
return destination return destination
def to_static_state_dict(self, def to_static_state_dict(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
use_hook=True): use_hook=True,
):
''' '''
Get all parameters and buffers of current layer and its sub-layers. And set them into a dict Get all parameters and buffers of current layer and its sub-layers. And set them into a dict
Parameters: Parameters:
...@@ -1405,7 +1527,7 @@ class Layer(object): ...@@ -1405,7 +1527,7 @@ class Layer(object):
use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True
Returns: Returns:
dict: a dict contains all the parameters and persistable buffers. dict, a dict contains all the parameters and persistable buffers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1423,13 +1545,16 @@ class Layer(object): ...@@ -1423,13 +1545,16 @@ class Layer(object):
include_sublayers=include_sublayers, include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix, structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True, include_non_persistable_buffer=True,
use_hook=use_hook) use_hook=use_hook,
)
def state_dict(self, def state_dict(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
use_hook=True): use_hook=True,
):
''' '''
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...@@ -1457,7 +1582,8 @@ class Layer(object): ...@@ -1457,7 +1582,8 @@ class Layer(object):
include_sublayers=include_sublayers, include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix, structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False, include_non_persistable_buffer=False,
use_hook=use_hook) use_hook=use_hook,
)
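
The hunks above reformat `_state_dict_impl` and its public wrappers `to_static_state_dict` and `state_dict`, which collect parameters plus (persistable) buffers into a dict. A minimal dygraph sketch of the documented round trip, mirroring the docstring example:

.. code-block:: python

    import paddle

    emb = paddle.nn.Embedding(10, 3)

    state = emb.state_dict()            # parameters + persistable buffers
    paddle.save(state, "emb.pdparams")

    emb2 = paddle.nn.Embedding(10, 3)
    emb2.set_state_dict(paddle.load("emb.pdparams"))
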
@framework.deprecate_stat_dict @framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True): def set_state_dict(self, state_dict, use_structured_name=True):
...@@ -1489,22 +1615,31 @@ class Layer(object): ...@@ -1489,22 +1615,31 @@ class Layer(object):
state = state_dict.get(key, None) state = state_dict.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the provided dict.".format(key)) "{} is not found in the provided dict.".format(key)
if (isinstance(state, dict) or isinstance(state, list)): )
if (len(state) != len(param)): if isinstance(state, dict) or isinstance(state, list):
raise ValueError("{} receieves the length of {}, " if len(state) != len(param):
raise ValueError(
"{} receieves the length of {}, "
"but the expected shape is {}".format( "but the expected shape is {}".format(
key, len(state), len(param))) key, len(state), len(param)
)
)
else: else:
return param, state return param, state
else: else:
state_shape = state.shape() if inspect.ismethod( state_shape = (
state.shape) else state.shape state.shape()
if inspect.ismethod(state.shape)
else state.shape
)
if list(state_shape) != list(param.shape): if list(state_shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state_shape), list(param.shape))) key, list(state_shape), list(param.shape)
)
)
return param, state return param, state
matched_param_state = [] matched_param_state = []
...@@ -1541,8 +1676,10 @@ class Layer(object): ...@@ -1541,8 +1676,10 @@ class Layer(object):
executor = Executor(_get_device())._default_executor executor = Executor(_get_device())._default_executor
# restore parameter states # restore parameter states
core._create_loaded_parameter( core._create_loaded_parameter(
[param for param, state in matched_param_state], global_scope(), [param for param, state in matched_param_state],
executor) global_scope(),
executor,
)
for param, state in matched_param_state: for param, state in matched_param_state:
_set_var(param, state) _set_var(param, state)
...@@ -1594,11 +1731,13 @@ class Layer(object): ...@@ -1594,11 +1731,13 @@ class Layer(object):
# [ 0.33960250, 0.96878713]]) # [ 0.33960250, 0.96878713]])
''' '''
return self._to_impl(device=device, return self._to_impl(
device=device,
dtype=dtype, dtype=dtype,
blocking=blocking, blocking=blocking,
include_sublayers=True, include_sublayers=True,
floating_only=False) floating_only=False,
)
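
`to()` simply forwards to `_to_impl` with `include_sublayers=True` and `floating_only=False`. A short sketch of the documented usage, assuming dygraph mode:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(2, 2)
    linear.to(dtype='float64')    # cast parameters and buffers to float64
    linear.to(device='cpu')       # move parameters and buffers to CPU
    print(linear.weight.dtype, linear.weight.place)
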
def _apply(self, func, device, dtype, blocking, include_sublayers=True): def _apply(self, func, device, dtype, blocking, include_sublayers=True):
if include_sublayers: if include_sublayers:
...@@ -1612,8 +1751,9 @@ class Layer(object): ...@@ -1612,8 +1751,9 @@ class Layer(object):
if param.grad is not None: if param.grad is not None:
with no_grad(): with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype, grad_applied = func(
blocking) param._grad_ivar(), device, dtype, blocking
)
for key, buf in self._buffers.items(): for key, buf in self._buffers.items():
if buf is not None: if buf is not None:
...@@ -1637,12 +1777,14 @@ class Layer(object): ...@@ -1637,12 +1777,14 @@ class Layer(object):
# Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute the memory space occupied by ‘t’. # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute the memory space occupied by ‘t’.
# Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
waiting_alloc_memory = ( waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
gpu_memory_available = core.gpu_memory_available() gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory: if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu # Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(), t_used = t._copy_to(
blocking) # k-v type will error paddle.CPUPlace(), blocking
) # k-v type will error
# Release mem of t # Release mem of t
t.value().get_tensor()._clear() t.value().get_tensor()._clear()
else: else:
...@@ -1653,7 +1795,8 @@ class Layer(object): ...@@ -1653,7 +1795,8 @@ class Layer(object):
# 2. cast param / Tensor to dtype # 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype: if dtype is not None and dtype != t_used.dtype:
with paddle.fluid.framework._dygraph_place_guard( with paddle.fluid.framework._dygraph_place_guard(
place=t_used.place): place=t_used.place
):
t_casted = t_used.cast(dtype=dtype) t_casted = t_used.cast(dtype=dtype)
else: else:
t_casted = t_used t_casted = t_used
...@@ -1671,12 +1814,14 @@ class Layer(object): ...@@ -1671,12 +1814,14 @@ class Layer(object):
return t return t
def _to_impl(self, def _to_impl(
self,
device=None, device=None,
dtype=None, dtype=None,
blocking=None, blocking=None,
include_sublayers=True, include_sublayers=True,
floating_only=False): floating_only=False,
):
''' '''
Cast the parameters and buffers of Layer by the given device, dtype and blocking. Cast the parameters and buffers of Layer by the given device, dtype and blocking.
...@@ -1705,20 +1850,28 @@ class Layer(object): ...@@ -1705,20 +1850,28 @@ class Layer(object):
if device is not None: if device is not None:
if isinstance(device, str): if isinstance(device, str):
device = paddle.device._convert_to_place(device) device = paddle.device._convert_to_place(device)
elif isinstance(device, (core.CPUPlace, core.CUDAPlace, elif isinstance(
core.CUDAPinnedPlace, core.XPUPlace)): device,
(
core.CPUPlace,
core.CUDAPlace,
core.CUDAPinnedPlace,
core.XPUPlace,
),
):
pass pass
else: else:
raise ValueError( raise ValueError(
"device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is "
+ type(device).__name__) + type(device).__name__
)
if blocking is None: if blocking is None:
blocking = True blocking = True
else: else:
assert isinstance( assert isinstance(
blocking, blocking, bool
bool), "blocking value error, must be the True, False or None" ), "blocking value error, must be the True, False or None"
def transform(t, device, dtype, blocking): def transform(t, device, dtype, blocking):
if floating_only and (not paddle.is_floating_point(t)): if floating_only and (not paddle.is_floating_point(t)):
......
...@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass): ...@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
@six.add_metaclass(VariableMetaClass) @six.add_metaclass(VariableMetaClass)
class Variable(object): class Variable(object):
""" """
**Notes**:
**The constructor of Variable should not be invoked directly.**
**In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.** Notes:
The constructor of Variable should not be invoked directly.
In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being fed.
**In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data** In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data.
In Fluid, every input and output of an OP is a variable. In most In Fluid, every input and output of an OP is a variable. In most
cases, variables are used for holding different kinds of data or training cases, variables are used for holding different kinds of data or training
...@@ -1514,12 +1515,13 @@ class Variable(object): ...@@ -1514,12 +1515,13 @@ class Variable(object):
def detach(self): def detach(self):
""" """
Returns a new Variable, detached from the current graph. Returns a new Variable, detached from the current graph.
It will share data with origin Variable and without tensor copy. It will share data with origin Variable and without tensor copy.
In addition, the detached Variable doesn't provide gradient propagation. In addition, the detached Variable doesn't provide gradient propagation.
Returns: Returns:
( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1533,6 +1535,7 @@ class Variable(object): ...@@ -1533,6 +1535,7 @@ class Variable(object):
# create a detached Variable # create a detached Variable
y = x.detach() y = x.detach()
""" """
assert ( assert (
...@@ -2085,6 +2088,7 @@ class Variable(object): ...@@ -2085,6 +2088,7 @@ class Variable(object):
@property @property
def T(self): def T(self):
""" """
Permute current Variable with its dimensions reversed. Permute current Variable with its dimensions reversed.
If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`. If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
...@@ -2103,6 +2107,7 @@ class Variable(object): ...@@ -2103,6 +2107,7 @@ class Variable(object):
x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0] x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
print(x_T_np.shape) print(x_T_np.shape)
# (5, 3, 2) # (5, 3, 2)
""" """
if len(self.shape) == 1: if len(self.shape) == 1:
return self return self
...@@ -2141,7 +2146,7 @@ class Variable(object): ...@@ -2141,7 +2146,7 @@ class Variable(object):
as ``out = assign(tensor)`` . as ``out = assign(tensor)`` .
Returns: Returns:
Variable: The cloned Variable. Variable, The cloned Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2171,6 +2176,7 @@ class Variable(object): ...@@ -2171,6 +2176,7 @@ class Variable(object):
def _set_error_clip(self, error_clip): def _set_error_clip(self, error_clip):
""" """
Set the error_clip. Set the error_clip.
Args: Args:
...@@ -2178,11 +2184,13 @@ class Variable(object): ...@@ -2178,11 +2184,13 @@ class Variable(object):
Returns: Returns:
None None
""" """
self.error_clip = error_clip self.error_clip = error_clip
def _set_info(self, key, value): def _set_info(self, key, value):
""" """
Set key-value information for this variable. Set key-value information for this variable.
Args: Args:
...@@ -2191,6 +2199,7 @@ class Variable(object): ...@@ -2191,6 +2199,7 @@ class Variable(object):
Returns: Returns:
None None
""" """
if not hasattr(self, "_info"): if not hasattr(self, "_info"):
self._info = {} self._info = {}
...@@ -2198,6 +2207,7 @@ class Variable(object): ...@@ -2198,6 +2207,7 @@ class Variable(object):
def _get_info(self, key): def _get_info(self, key):
""" """
Get the information of this variable corresponding to key. Get the information of this variable corresponding to key.
Args: Args:
...@@ -2205,6 +2215,7 @@ class Variable(object): ...@@ -2205,6 +2215,7 @@ class Variable(object):
Returns: Returns:
object object
""" """
if hasattr(self, "_info") and key in self._info: if hasattr(self, "_info") and key in self._info:
return self._info[key] return self._info[key]
...@@ -2212,7 +2223,9 @@ class Variable(object): ...@@ -2212,7 +2223,9 @@ class Variable(object):
def _slice_indices(self, slice, length): def _slice_indices(self, slice, length):
""" """
Reference implementation for the slice.indices method. Reference implementation for the slice.indices method.
""" """
# Compute step and length as integers. # Compute step and length as integers.
step = 1 if slice.step is None else slice.step step = 1 if slice.step is None else slice.step
...@@ -2383,7 +2396,7 @@ class Variable(object): ...@@ -2383,7 +2396,7 @@ class Variable(object):
Default: None Default: None
Returns: Returns:
Tensor: the value in given scope. Tensor, the value in given scope.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2438,6 +2451,7 @@ class Variable(object): ...@@ -2438,6 +2451,7 @@ class Variable(object):
def set_value(self, value, scope=None): def set_value(self, value, scope=None):
''' '''
Set the value to the tensor in given scope. Set the value to the tensor in given scope.
Args: Args:
...@@ -2477,6 +2491,7 @@ class Variable(object): ...@@ -2477,6 +2491,7 @@ class Variable(object):
if var.persistable: if var.persistable:
t_load = paddle.load(path+var.name+'.pdtensor') t_load = paddle.load(path+var.name+'.pdtensor')
var.set_value(t_load) var.set_value(t_load)
''' '''
# The 'framework' is a low-level module, and 'executor' # The 'framework' is a low-level module, and 'executor'
...@@ -2547,10 +2562,11 @@ class Variable(object): ...@@ -2547,10 +2562,11 @@ class Variable(object):
def size(self): def size(self):
""" """
Returns the number of elements for current Variable, which is a int64 Variable with shape [1] Returns the number of elements for current Variable, which is a int64 Variable with shape [1]
Returns: Returns:
Variable: the number of elements for current Variable Variable, the number of elements for current Variable
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2564,6 +2580,7 @@ class Variable(object): ...@@ -2564,6 +2580,7 @@ class Variable(object):
# get the number of elements of the Variable # get the number of elements of the Variable
y = x.size() y = x.size()
""" """
output = self.block.create_var( output = self.block.create_var(
...@@ -2578,23 +2595,27 @@ class Variable(object): ...@@ -2578,23 +2595,27 @@ class Variable(object):
def _set_attr(self, name, val): def _set_attr(self, name, val):
""" """
Set the value of attribute by attribute's name. Set the value of attribute by attribute's name.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
val(int|str|list): the value of the attribute. val(int|str|list): the value of the attribute.
""" """
self._update_desc_attr(name, val) self._update_desc_attr(name, val)
def _has_attr(self, name): def _has_attr(self, name):
""" """
Whether this Variable has the attribute with the name `name` or not. Whether this Variable has the attribute with the name `name` or not.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
Returns: Returns:
bool: True if has this attribute. bool, True if has this attribute.
""" """
return self.desc.has_attr(name) return self.desc.has_attr(name)
...@@ -2624,7 +2645,7 @@ class Variable(object): ...@@ -2624,7 +2645,7 @@ class Variable(object):
name(str): the attribute name. name(str): the attribute name.
Returns: Returns:
int|str|list: The attribute value. The return value int|str|list, The attribute value. The return value
can be any valid attribute type. can be any valid attribute type.
""" """
return self.desc.attr(name) return self.desc.attr(name)
...@@ -3196,14 +3217,16 @@ class Operator(object): ...@@ -3196,14 +3217,16 @@ class Operator(object):
def input(self, name): def input(self, name):
r""" r"""
Get the input arguments according to the input parameter name. Get the input arguments according to the input parameter name.
Args: Args:
name(str): The input parameter name. name(str): The input parameter name.
Returns: Returns:
list: return the list of argument names that associated with \ list, return the list of argument names that associated with \
the specific parameter name. the specific parameter name.
""" """
return self.desc.input(name) return self.desc.input(name)
......
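
The framework.py hunks above mostly adjust return-type wording in the `Variable` docstrings (`detach`, `get_value`, `set_value`, `size`, and the attribute helpers). A minimal static-graph sketch of the `get_value`/`set_value` pattern those docstrings describe; the `fc` layer and shapes are placeholder choices:

.. code-block:: python

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name="x", shape=[None, 10], dtype="float32")
    y = paddle.static.nn.fc(x, 5)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())

    prog = paddle.static.default_main_program()
    for var in prog.list_vars():
        if var.persistable:
            t = var.get_value()     # the Tensor held in the global scope
            var.set_value(t)        # write a value back into that scope
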
...@@ -20,7 +20,13 @@ from __future__ import print_function ...@@ -20,7 +20,13 @@ from __future__ import print_function
import warnings import warnings
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode from ..framework import (
Variable,
_non_static_mode,
_varbase_creator,
_in_legacy_dygraph,
in_dygraph_mode,
)
from .. import core from .. import core
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from . import nn from . import nn
...@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc'] ...@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None): def accuracy(input, label, k=1, correct=None, total=None):
""" """
accuracy layer. accuracy layer.
Refer to the https://en.wikipedia.org/wiki/Precision_and_recall Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
This function computes the accuracy using the input and label. This function computes the accuracy using the input and label.
If the correct label occurs in top k predictions, then correct will increment by one. If the correct label occurs in top k predictions, then correct will increment by one.
Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
Note:
the dtype of accuracy is determined by input. the input and label dtype can be different.
Args: Args:
input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
The shape is ``[sample_number, class_dim]`` . The shape is ``[sample_number, class_dim]`` .
label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
k(int): The top k predictions for each class will be checked. Data type is int64 or int32. k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
correct(Tensor): The correct predictions count. A Tensor with type int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
total(Tensor): The total entries count. A tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.
Returns: Returns:
Tensor: The correct rate. A Tensor with type float32. Tensor, The correct rate. A Tensor with type float32.
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle import paddle
import paddle.static as static import paddle.static as static
...@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
fetch_list=[result[0]]) fetch_list=[result[0]])
print(output) print(output)
#[array([0.], dtype=float32)] #[array([0.], dtype=float32)]
""" """
if _non_static_mode(): if _non_static_mode():
if correct is None: if correct is None:
...@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
total = _varbase_creator(dtype="int32") total = _varbase_creator(dtype="int32")
_k = k.numpy().item(0) if isinstance(k, Variable) else k _k = k.numpy().item(0) if isinstance(k, Variable) else k
topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k, topk_out, topk_indices = _legacy_C_ops.top_k_v2(
'sorted', False) input, 'k', _k, 'sorted', False
_acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, )
correct, total) _acc, _, _ = _legacy_C_ops.accuracy(
topk_out, topk_indices, label, correct, total
)
return _acc return _acc
helper = LayerHelper("accuracy", **locals()) helper = LayerHelper("accuracy", **locals())
check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], check_variable_and_dtype(
'accuracy') input, 'input', ['float16', 'float32', 'float64'], 'accuracy'
)
topk_out = helper.create_variable_for_type_inference(dtype=input.dtype) topk_out = helper.create_variable_for_type_inference(dtype=input.dtype)
topk_indices = helper.create_variable_for_type_inference(dtype="int64") topk_indices = helper.create_variable_for_type_inference(dtype="int64")
inputs = {"X": [input]} inputs = {"X": [input]}
...@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
else: else:
attrs = {'k': k} attrs = {'k': k}
attrs['sorted'] = False attrs['sorted'] = False
helper.append_op(type="top_k_v2", helper.append_op(
type="top_k_v2",
inputs=inputs, inputs=inputs,
attrs=attrs, attrs=attrs,
outputs={ outputs={"Out": [topk_out], "Indices": [topk_indices]},
"Out": [topk_out], )
"Indices": [topk_indices]
})
acc_out = helper.create_variable_for_type_inference(dtype="float32") acc_out = helper.create_variable_for_type_inference(dtype="float32")
if correct is None: if correct is None:
correct = helper.create_variable_for_type_inference(dtype="int32") correct = helper.create_variable_for_type_inference(dtype="int32")
if total is None: if total is None:
total = helper.create_variable_for_type_inference(dtype="int32") total = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(type="accuracy", helper.append_op(
inputs={ type="accuracy",
"Out": [topk_out], inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]},
"Indices": [topk_indices],
"Label": [label]
},
outputs={ outputs={
"Accuracy": [acc_out], "Accuracy": [acc_out],
"Correct": [correct], "Correct": [correct],
"Total": [total], "Total": [total],
}) },
)
return acc_out return acc_out
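
As the docstring above states, a sample counts as correct when its label appears among the top-k predictions. A small NumPy sketch of that semantics (an illustration of the metric, not the Paddle op itself):

.. code-block:: python

    import numpy as np

    def topk_accuracy(pred, label, k=1):
        # pred: [N, C] scores, label: [N, 1] integer class ids
        topk = np.argsort(-pred, axis=1)[:, :k]   # indices of the k largest scores
        correct = (topk == label.reshape(-1, 1)).any(axis=1).sum()
        return correct / float(pred.shape[0])

    pred = np.array([[0.1, 0.7, 0.2],
                     [0.3, 0.4, 0.3]], dtype="float32")
    label = np.array([[1], [0]], dtype="int64")
    print(topk_accuracy(pred, label, k=1))        # 0.5
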
def auc(input, def auc(
input,
label, label,
curve='ROC', curve='ROC',
num_thresholds=2**12 - 1, num_thresholds=2**12 - 1,
topk=1, topk=1,
slide_steps=1, slide_steps=1,
ins_tag_weight=None): ins_tag_weight=None,
):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -216,13 +232,14 @@ def auc(input, ...@@ -216,13 +232,14 @@ def auc(input,
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
if ins_tag_weight is None: if ins_tag_weight is None:
ins_tag_weight = tensor.fill_constant(shape=[1, 1], ins_tag_weight = tensor.fill_constant(
dtype="float32", shape=[1, 1], dtype="float32", value=1.0
value=1.0) )
check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc') check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc')
check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc') check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc')
check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight', check_variable_and_dtype(
['float32', 'float64'], 'auc') ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc'
)
auc_out = helper.create_variable_for_type_inference(dtype="float64") auc_out = helper.create_variable_for_type_inference(dtype="float64")
batch_auc_out = helper.create_variable_for_type_inference(dtype="float64") batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
...@@ -236,62 +253,71 @@ def auc(input, ...@@ -236,62 +253,71 @@ def auc(input,
batch_stat_pos = helper.create_global_variable( batch_stat_pos = helper.create_global_variable(
persistable=True, persistable=True,
dtype='int64', dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
batch_stat_neg = helper.create_global_variable( batch_stat_neg = helper.create_global_variable(
persistable=True, persistable=True,
dtype='int64', dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
# for global auc # for global auc
# Needn't maintain the batch id # Needn't maintain the batch id
stat_pos = helper.create_global_variable(persistable=True, stat_pos = helper.create_global_variable(
dtype='int64', persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
shape=[1, num_thresholds + 1]) )
stat_neg = helper.create_global_variable(persistable=True, stat_neg = helper.create_global_variable(
dtype='int64', persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
shape=[1, num_thresholds + 1]) )
for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer(var, Constant(value=0.0, helper.set_variable_initializer(
force_cpu=False)) var, Constant(value=0.0, force_cpu=False)
)
#"InsTagWeight": [ins_tag_weight] # "InsTagWeight": [ins_tag_weight]
# Batch AUC # Batch AUC
helper.append_op(type="auc", helper.append_op(
type="auc",
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"StatPos": [batch_stat_pos], "StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg] "StatNeg": [batch_stat_neg],
}, },
attrs={ attrs={
"curve": curve, "curve": curve,
"num_thresholds": num_thresholds, "num_thresholds": num_thresholds,
"slide_steps": slide_steps "slide_steps": slide_steps,
}, },
outputs={ outputs={
"AUC": [batch_auc_out], "AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos], "StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg] "StatNegOut": [batch_stat_neg],
}) },
)
# Global AUC # Global AUC
helper.append_op(type="auc", helper.append_op(
type="auc",
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"StatPos": [stat_pos], "StatPos": [stat_pos],
"StatNeg": [stat_neg] "StatNeg": [stat_neg],
}, },
attrs={ attrs={
"curve": curve, "curve": curve,
"num_thresholds": num_thresholds, "num_thresholds": num_thresholds,
"slide_steps": 0 "slide_steps": 0,
}, },
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"StatPosOut": [stat_pos], "StatPosOut": [stat_pos],
"StatNegOut": [stat_neg] "StatNegOut": [stat_neg],
}) },
return auc_out, batch_auc_out, [ )
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg return (
] auc_out,
batch_auc_out,
[batch_stat_pos, batch_stat_neg, stat_pos, stat_neg],
)
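
The `auc` op above keeps per-bucket positive/negative histograms (`stat_pos`, `stat_neg`) of length `num_thresholds + 1`; the batch statistics additionally carry `slide_steps` sliding windows, while the global statistics use `slide_steps=0`. A rough NumPy sketch of the histogram formulation of AUC that these statistics feed (an approximation for illustration, not the kernel itself):

.. code-block:: python

    import numpy as np

    def histogram_auc(scores, labels, num_thresholds=200):
        # bucket index of each prediction in [0, num_thresholds]
        buckets = np.minimum((scores * num_thresholds).astype(int), num_thresholds)
        stat_pos = np.bincount(buckets[labels == 1], minlength=num_thresholds + 1)
        stat_neg = np.bincount(buckets[labels == 0], minlength=num_thresholds + 1)

        # walk the buckets from high score to low score and accumulate the ROC area
        tot_pos = tot_neg = 0.0
        area = 0.0
        for i in range(num_thresholds, -1, -1):
            new_pos = tot_pos + stat_pos[i]
            new_neg = tot_neg + stat_neg[i]
            # trapezoid between consecutive (FP, TP) points
            area += (new_neg - tot_neg) * (tot_pos + new_pos) / 2.0
            tot_pos, tot_neg = new_pos, new_neg
        return area / (tot_pos * tot_neg) if tot_pos and tot_neg else 0.0

    scores = np.array([0.1, 0.4, 0.35, 0.8])
    labels = np.array([0, 0, 1, 1])
    print(histogram_auc(scores, labels))   # 0.75
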
The source diff for this file is too large to display; you can view the blob instead.
...@@ -241,13 +241,13 @@ def send_ue_recv( ...@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64. The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`. Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used. out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
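
The parameter notes above describe `send_ue_recv` as gathering node features by `src_index`, combining them with the edge feature via `message_op`, and reducing the resulting messages by `dst_index`. A plain NumPy sketch of the `add`/`sum` case (illustration only, not the fused kernel):

.. code-block:: python

    import numpy as np

    def send_ue_recv_sum(x, e, src_index, dst_index, out_size=None):
        # gather node features along edges, add edge features, scatter-sum to dst
        out_size = out_size or int(dst_index.max()) + 1
        messages = x[src_index] + e                      # message_op == "add"
        out = np.zeros((out_size,) + x.shape[1:], dtype=x.dtype)
        np.add.at(out, dst_index, messages)              # reduce_op == "sum"
        return out

    x = np.array([[0., 2., 3.], [1., 4., 5.], [2., 6., 7.]])
    e = np.ones((4, 3))
    src = np.array([0, 0, 1, 2])
    dst = np.array([1, 2, 1, 0])
    print(send_ue_recv_sum(x, e, src, dst))
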
...@@ -26,6 +26,7 @@ def reindex_graph( ...@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex Graph API. Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -49,12 +50,12 @@ def reindex_graph( ...@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,6 +70,7 @@ def reindex_graph( ...@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7] neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2] count = [2, 3, 2]
...@@ -138,6 +140,7 @@ def reindex_heter_graph( ...@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex HeterGraph API. Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -161,12 +164,12 @@ def reindex_heter_graph( ...@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`. The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32. And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -183,6 +186,7 @@ def reindex_heter_graph( ...@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7] neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2] count_a = [2, 3, 2]
......
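
`reindex_graph` maps the original node ids in `x` and `neighbors` onto a compact 0..n-1 id space, with the nodes of `x` keeping their positions. A short pure-Python sketch of that mapping for the docstring's sample data (hashtable buffers and the heterogeneous variant are omitted):

.. code-block:: python

    def reindex_graph(x, neighbors, count):
        # nodes in `x` keep their position; unseen neighbor ids are appended
        node_map = {node: i for i, node in enumerate(x)}
        out_nodes = list(x)
        for n in neighbors:
            if n not in node_map:
                node_map[n] = len(out_nodes)
                out_nodes.append(n)
        reindex_src = [node_map[n] for n in neighbors]
        # dst of each edge is the x-node whose neighbor block it belongs to
        reindex_dst = [i for i, c in enumerate(count) for _ in range(c)]
        return reindex_src, reindex_dst, out_nodes

    print(reindex_graph([0, 1, 2], [8, 9, 0, 4, 7, 6, 7], [2, 3, 2]))
    # ([3, 4, 0, 5, 6, 7, 6], [0, 0, 1, 1, 1, 2, 2], [0, 1, 2, 8, 9, 4, 7, 6])
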
...@@ -32,6 +32,7 @@ def sample_neighbors( ...@@ -32,6 +32,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -52,16 +53,16 @@ def sample_neighbors( ...@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fisher-yates sampling be the same with `row`. If not None, we will use fisher-yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -78,6 +79,7 @@ def sample_neighbors( ...@@ -78,6 +79,7 @@ def sample_neighbors(
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
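
`sample_neighbors` draws up to `sample_size` neighbors per input node from a CSC graph described by `row` and `colptr`. A simplified NumPy sketch of uniform sampling without the `eids`/`perm_buffer` options (the sampled ids will differ from the GPU kernel's, since the RNG differs):

.. code-block:: python

    import numpy as np

    def sample_neighbors(row, colptr, input_nodes, sample_size=-1, seed=0):
        rng = np.random.default_rng(seed)
        out_neighbors, out_count = [], []
        for node in input_nodes:
            candidates = row[colptr[node]:colptr[node + 1]]   # in-neighbors of `node`
            if sample_size < 0 or sample_size >= len(candidates):
                chosen = candidates
            else:
                chosen = rng.choice(candidates, size=sample_size, replace=False)
            out_neighbors.extend(chosen.tolist())
            out_count.append(len(chosen))
        return out_neighbors, out_count

    # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), ...
    row = np.array([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7])
    colptr = np.array([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13])
    print(sample_neighbors(row, colptr, input_nodes=[0, 1], sample_size=2))
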
...@@ -69,8 +69,9 @@ def to_list(value): ...@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var): def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase, assert isinstance(
fluid.core.eager.Tensor)), "not a variable" var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy() return var.numpy()
t = global_scope().find_var(var.name).get_tensor() t = global_scope().find_var(var.name).get_tensor()
...@@ -105,10 +106,9 @@ def extract_args(func): ...@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(x, return collective._c_allgather(
nranks, x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
ring_id=ring_id, )
use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints): def wait_server_ready(endpoints):
...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints): ...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints: for ep in endpoints:
ip_port = ep.split(":") ip_port = ep.split(":")
with contextlib.closing( with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1]))) result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0: if result != 0:
...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints): ...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break break
def init_communicator(program, rank, nranks, wait_port, current_endpoint, def init_communicator(
endpoints): program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2: if nranks < 2:
return return
other_endpoints = endpoints[:] other_endpoints = endpoints[:]
...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var( nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'), name=fluid.unique_name.generate('nccl_id'),
persistable=True, persistable=True,
type=fluid.core.VarDesc.VarType.RAW) type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(type='c_gen_nccl_id', block.append_op(
type='c_gen_nccl_id',
inputs={}, inputs={},
outputs={'Out': nccl_id_var}, outputs={'Out': nccl_id_var},
attrs={ attrs={
'rank': rank, 'rank': rank,
'endpoint': current_endpoint, 'endpoint': current_endpoint,
'other_endpoints': other_endpoints 'other_endpoints': other_endpoints,
}) },
)
block.append_op(type='c_comm_init', block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var}, inputs={'X': nccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks, 'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': 0, 'ring_id': 0,
}) },
)
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
hccl_id_var = block.create_var( hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'), name=fluid.unique_name.generate('hccl_id'),
persistable=True, persistable=True,
type=core.VarDesc.VarType.RAW) type=core.VarDesc.VarType.RAW,
block.append_op(type='c_gen_hccl_id', )
block.append_op(
type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var}, outputs={'Out': hccl_id_var},
attrs={ attrs={
'rank': rank, 'rank': rank,
'endpoint': current_endpoint, 'endpoint': current_endpoint,
'other_endpoints': other_endpoints 'other_endpoints': other_endpoints,
}) },
block.append_op(type='c_comm_init_hccl', )
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var}, inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'rank': rank, 'rank': rank,
'ring_id': 0, 'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks 'rank_ids': nranks,
}) },
)
def prepare_distributed_context(place=None): def prepare_distributed_context(place=None):
if place is None: if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0) else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place) place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy() strategy = fluid.dygraph.parallel.ParallelStrategy()
...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None): ...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context(): def _init_context():
communicator_prog = fluid.Program() communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank, init_communicator(
strategy.nranks, True, strategy.current_endpoint, communicator_prog,
strategy.trainer_endpoints) strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(communicator_prog) exe.run(communicator_prog)
...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None): ...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place) fluid.enable_dygraph(place)
else: else:
assert ("Only support CUDAPlace for now.") assert "Only support CUDAPlace for now."
_parallel_context_initialized = True _parallel_context_initialized = True
return strategy return strategy
...@@ -246,7 +266,9 @@ def _update_input_info(inputs): ...@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object): class StaticGraphAdapter(object):
""" """
Model training/inference with a static graph. Model training/inference with a static graph.
""" """
def __init__(self, model): def __init__(self, model):
...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object): ...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._nranks = ParallelEnv().nranks self._nranks = ParallelEnv().nranks
...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object): ...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value self.model.mode = value
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train' self.mode = 'train'
assert update is True, "Does not support `update == False` in static mode by now." assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels) return self._run(inputs, labels)
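
`train_batch` asserts that `model.prepare()` has already supplied an optimizer. A minimal sketch of the high-level hapi workflow that satisfies this check (the dataset, network, and hyper-parameters are placeholder choices):

.. code-block:: python

    import paddle
    from paddle.vision.datasets import MNIST
    from paddle.vision.transforms import ToTensor

    net = paddle.nn.Sequential(paddle.nn.Flatten(), paddle.nn.Linear(784, 10))
    model = paddle.Model(net)

    model.prepare(
        optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
        loss=paddle.nn.CrossEntropyLoss(),
        metrics=paddle.metric.Accuracy(),
    )
    model.fit(MNIST(mode='train', transform=ToTensor()), epochs=1, batch_size=64)
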
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object): ...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs) return self.model.network.parameters(*args, **kwargs)
def save(self, path): def save(self, path):
def _save(state, path): def _save(state, path):
if not state: if not state:
return return
...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object): ...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode # XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt" optim_path = path + ".pdopt"
optim = { optim = {
p.name: p p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
for p in filter(is_belong_to_optimizer, prog.list_vars())
} }
if not optim: if not optim:
return return
...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object): ...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states # restore parameter states
fluid.core._create_loaded_parameter( fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs], global_scope(), [param for param, state in param_state_pairs],
executor) global_scope(),
executor,
)
for param, state in param_state_pairs: for param, state in param_state_pairs:
self._set_var(param, state) self._set_var(param, state)
...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object): ...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
state_val = ( state_val = (
np.array(converted_state.pop("global_step")) - 1 (np.array(converted_state.pop("global_step")) - 1)
) if "global_step" in converted_state else converted_state.pop( if "global_step" in converted_state
"@LR_DECAY_COUNTER@", None) else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None: if state_val is not None:
converted_state[var.name] = state_val converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"): elif var.name.startswith("learning_rate_"):
...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object): ...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None opt_unq_name = None
for name in self.model._optimizer._accumulators.keys(): for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[ accum_name = (
len(opt_name) + 1:] name
for param_name, state_var in self.model._optimizer._accumulators[ if opt_name is None
name].items(): else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None: if opt_unq_name is None:
# can not infer out the exact unique(opt_name), # can not infer out the exact unique(opt_name),
# thus try to extract rather than generate # thus try to extract rather than generate
for state_key in sorted(state.keys(), for state_key in sorted(
state.keys(),
key=lambda x: len(x), key=lambda x: len(x),
reverse=True): reverse=True,
prefix = param_name + "_" + ( ):
prefix = (
param_name
+ "_"
+ (
opt_cls_name opt_cls_name
if opt_name is None else opt_name) + "_" if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix): if state_key.startswith(prefix):
prefix_offset = state_key[len( prefix_offset = state_key[
prefix):].find("_") + len(prefix) len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[ opt_unq_name = state_key[
len(param_name + "_"):prefix_offset] len(
param_name + "_"
) : prefix_offset
]
# TODO: assert # TODO: assert
# assert opt_unq_name is None # assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name # always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + dy_state_name = (
"_" + accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[ converted_state[
state_var.name] = converted_state.pop( state_var.name
dy_state_name) ] = converted_state.pop(dy_state_name)
assert var.name in converted_state, \ assert (
"variable [{}] is not in optimizer state file".format(var.name) var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name]) self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray): def _set_var(self, var, ndarray):
...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object): ...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None): def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None) compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \ assert (
"Model is not ready, please call `model.prepare()` first" compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs) inputs = to_list(inputs)
if labels is not None: if labels is not None:
labels = to_list(labels) labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \ assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs" \ "number of inputs"
+ " does not match number of arguments of `forward` method" + " does not match number of arguments of `forward` method"
)
feed = {} feed = {}
input_names = [v.name for v in self._input_vars[self.mode]] input_names = [v.name for v in self._input_vars[self.mode]]
...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object): ...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments # train and test may take different arguments
if inputs[idx] is not None: if inputs[idx] is not None:
feed[n] = inputs[idx] feed[n] = inputs[idx]
if self._amp_level == 'O2' and input_dtypes[ if (
idx] == core.VarDesc.VarType.FP16: self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor): if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array): elif isinstance(feed[n], np.array):
...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object): ...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else: else:
pruned_fetch_list.append(fetch_var) pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog, rets = self._executor.run(
compiled_prog,
feed=feed, feed=feed,
fetch_list=pruned_fetch_list, fetch_list=pruned_fetch_list,
return_numpy=False) return_numpy=False,
)
# restore pruned fetch_list Variable from feeds # restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map): for i, name in enumerate(pruned_fetch_idx_name_map):
...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object): ...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = [] metrics = []
for metric, state in zip(self.model._metrics, metric_states): for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size # cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \ if (
and isinstance(self.model._test_dataloader, DataLoader) \ self.mode != 'train'
and self._nranks > 1: and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size # TODO: fixme if have better way to get batch size
samples = state[0].shape[0] samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
state = [ state = [
s[:int(total_size - current_count), ...] for s in state s[: int(total_size - current_count), ...] for s in state
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object): ...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train': if mode != 'train':
for op in list(prog.global_block().ops): for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0) prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \ if (
and self.model._optimizer._learning_rate_map: mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue # HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name] new_lr_var = prog.global_block().vars[lr_var.name]
@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
                dist_strategy.amp = True
                dist_strategy.amp_configs = self._amp_configs.copy()
                dist_strategy.amp_configs.update(self._amp_custom_lists)
                dist_strategy.amp_configs['use_pure_fp16'] = (
                    self._amp_level == 'O2'
                )
            self.model._optimizer = fleet.distributed_optimizer(
                self.model._optimizer, strategy=dist_strategy
            )
        elif self._amp_level != "O0" and core.is_compiled_with_cuda:
            amp_lists = (
                paddle.static.amp.AutoMixedPrecisionLists(
                    **self._amp_custom_lists
                )
                if self._amp_custom_lists
                else None
            )
            self.model._optimizer = paddle.static.amp.decorate(
                self.model._optimizer,
                amp_lists=amp_lists,
                use_pure_fp16=self._amp_level == "O2",
                use_fp16_guard=self._use_fp16_guard,
                **self._amp_configs
            )

        self.model._optimizer.minimize(self._loss_endpoint)
@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
        self._endpoints[mode] = {
            "output": outputs,
            "loss": to_list(losses),
            "metric": metrics,
        }

    def _compile_and_initialize(self, prog, mode):
@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
        if compiled_prog is not None:
            return compiled_prog

        assert (
            self.model._place is not None
        ), "device is not set, please call `model.prepare()` first"

        place = self.model._place
@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
        uninitialized = []
        for var_py in self._startup_prog.list_vars():
            var = fluid.global_scope().find_var(var_py.name)
            if (
                not var_py.name.startswith('nccl_id')
                and var
                and var.get_tensor()._is_initialized()
            ):
                continue
            uninitialized.append(var_py)
@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
            startup_prog = self._startup_prog._prune(uninitialized)
            self._executor.run(startup_prog)

        if (
            self._amp_level == "O2"
            and mode == 'train'
            and core.is_compiled_with_cuda()
        ):
            self.model._optimizer.amp_init(place)
@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object):
    def __init__(self, model):
        super(DynamicGraphAdapter, self).__init__()
        self.model = model
@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
            'eval_total': 0,
            'test_total': 0,
            'eval_batch': 0,
            'test_batch': 0,
        }
        self._input_info = None
@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
            stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
            stradegy.current_endpoint = ParallelEnv().current_endpoint
            self.ddp_model = fluid.dygraph.parallel.DataParallel(
                self.model.network, stradegy
            )

    @property
    def mode(self):
@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
    # TODO multi device in dygraph mode not implemented at present time
    def train_batch(self, inputs, labels=None, update=True):
        assert (
            self.model._optimizer
        ), "model not ready, please call `model.prepare()` first"
        self.model.network.train()
        self.mode = 'train'
        inputs = to_list(inputs)
@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
        if self._amp_level != "O0" and self.model._scaler is None:
            self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
        with paddle.amp.auto_cast(
            enable=self._amp_level != 'O0',
            **self._amp_custom_lists,
            level=self._amp_level
        ):
            if self._nranks > 1:
                outputs = self.ddp_model(*[to_variable(x) for x in inputs])
            else:
@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
            metrics.append(m)

        return (
            ([to_numpy(l) for l in losses], metrics)
            if len(metrics) > 0
            else [to_numpy(l) for l in losses]
        )
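The block above is the dygraph AMP training path. As a reference, here is a minimal sketch of the same GradScaler + auto_cast pattern outside the adapter; it assumes a GPU-enabled Paddle 2.x install (on CPU the cast is effectively a no-op) and uses a toy network with random data.

# Standalone sketch of the scaler/auto_cast pattern used by train_batch.
import paddle

net = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.Adam(parameters=net.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([8, 4])
label = paddle.rand([8, 2])

with paddle.amp.auto_cast(enable=True, level='O1'):
    out = net(x)
    loss = paddle.nn.functional.mse_loss(out, label)

scaled = scaler.scale(loss)     # scale the loss before backward
scaled.backward()
scaler.minimize(opt, scaled)    # unscale gradients and update parameters
opt.clear_grad()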
    def eval_batch(self, inputs, labels=None):
        self.model.network.eval()
@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
        metrics = []
        for metric in self.model._metrics:
            # cut off padding value.
            if (
                self.model._test_dataloader is not None
                and self._nranks > 1
                and isinstance(self.model._test_dataloader, DataLoader)
            ):
                total_size = len(self.model._test_dataloader.dataset)
                samples = outputs[0].shape[0]
                current_count = self._merge_count.get(self.mode + '_total', 0)
                if current_count + samples >= total_size:
                    outputs = [
                        o[: int(total_size - current_count)] for o in outputs
                    ]
                    labels = [
                        l[: int(total_size - current_count)] for l in labels
                    ]
                    self._merge_count[self.mode + '_total'] = 0
                    self._merge_count[self.mode + '_batch'] = int(
                        total_size - current_count
                    )
                else:
                    self._merge_count[self.mode + '_total'] += samples
                    self._merge_count[self.mode + '_batch'] = samples
@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
            opt_unq_name = ''
        opt_cls_name = self.model._optimizer.__class__.__name__
        opt_name = opt_unq_name[: opt_unq_name.rfind("_")]  # remove suffix idx
        param_names = [param.name for param in self.model.network.parameters()]
        for var_name, state_var in sorted(
            optim_state.items(), key=lambda x: len(x[0]), reverse=True
        ):
            if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
                # NOTE: dygraph saved global_step is 1 larger than that in
                # static-graph, since the time of global_step to increase is
                # different.
                if var_name == "@LR_DECAY_COUNTER@":
                    converted_state["global_step"] = (
                        np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
                    )
            else:
                # moment and other accumulators
                # extend state dict to include promising dygraph names
                for param_name in param_names:
                    if var_name.startswith(param_name + "_" + opt_name):
                        # when init optimizer with name
                        accum_name = var_name[
                            len(param_name + "_" + opt_name + "_") :
                        ]
                    elif (
                        var_name.startswith(param_name + "_")
                        and opt_name == opt_cls_name
                    ):
                        # when init optimizer without name
                        accum_name = var_name[len(param_name + "_") :]
                    else:
                        continue
                    # remove suffix idx
                    accum_name = accum_name[: accum_name.rfind("_")]
                    # state names always end with "_0" in dygraph because of the
                    # unique optimizer._name
                    dy_state_name = (
                        param_name
                        + "_"
                        + opt_unq_name
                        + "_"
                        + accum_name
                        + "_0"
                    )
                    converted_state[dy_state_name] = state_var

        if not hasattr(self.model._optimizer, 'set_state_dict'):
@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
            self.model._optimizer.set_state_dict(converted_state)
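The name juggling in the loop above converts static-graph accumulator names into the dygraph naming scheme. A worked example with purely hypothetical names, mirroring the same string operations:

# Illustrative only: how one static accumulator name maps onto its dygraph name.
param_name = "linear_0.w_0"
opt_unq_name = "adam_0"                                # optimizer name + unique suffix
opt_name = opt_unq_name[: opt_unq_name.rfind("_")]     # -> "adam"

static_name = "linear_0.w_0_adam_moment1_0"            # as saved by the static graph
accum_name = static_name[len(param_name + "_" + opt_name + "_"):]   # "moment1_0"
accum_name = accum_name[: accum_name.rfind("_")]                    # "moment1"

dy_state_name = param_name + "_" + opt_unq_name + "_" + accum_name + "_0"
print(dy_state_name)                                   # linear_0.w_0_adam_0_moment1_0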
    def prepare(self):
        if (
            self._amp_level == "O2"
            and self.model.mode == 'train'
            and core.is_compiled_with_cuda()
        ):
            self.model.network, self.model._optimizer = paddle.amp.decorate(
                models=self.model.network,
                optimizers=self.model._optimizer,
                level='O2',
            )
        if self._amp_level != "O0":
            self.model._scaler = None
class Model(object):
    """
    A Model object is a network with training and inference features.
    Dynamic graph and static graph are supported at the same time,
    switched by `paddle.enable_static()`. The usage is as follows.
@@ -1053,6 +1156,7 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
""" """
Run one training step on one batch of data. And using `update` indicates Run one training step on one batch of data. And using `update` indicates
whether optimizer update gradients computing by this batch. whether optimizer update gradients computing by this batch.
...@@ -1098,6 +1202,7 @@ class Model(object): ...@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label]) loss = model.train_batch([data], [label])
print(loss) print(loss)
# [array([2.192784], dtype=float32)] # [array([2.192784], dtype=float32)]
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1107,6 +1212,7 @@ class Model(object): ...@@ -1107,6 +1212,7 @@ class Model(object):
@no_grad() @no_grad()
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
""" """
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
...@@ -1150,6 +1256,7 @@ class Model(object): ...@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label]) loss, acc = model.eval_batch([data], [label])
print(loss, acc) print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0] # [array([2.8825705], dtype=float32)] [0.0]
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1159,6 +1266,7 @@ class Model(object): ...@@ -1159,6 +1266,7 @@ class Model(object):
@no_grad() @no_grad()
def predict_batch(self, inputs): def predict_batch(self, inputs):
""" """
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
...@@ -1197,6 +1305,7 @@ class Model(object): ...@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)] # dtype=float32)]
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1205,6 +1314,7 @@ class Model(object): ...@@ -1205,6 +1314,7 @@ class Model(object):
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and This function saves parameters, optimizer information or model and
paramters only for inference to path. It depends on the parameter paramters only for inference to path. It depends on the parameter
`training`. `training`.
...@@ -1272,6 +1382,7 @@ class Model(object): ...@@ -1272,6 +1382,7 @@ class Model(object):
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
""" """
if ParallelEnv().local_rank == 0: if ParallelEnv().local_rank == 0:
...@@ -1282,6 +1393,7 @@ class Model(object): ...@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False): def load(self, path, skip_mismatch=False, reset_optimizer=False):
""" """
Load from files storing the model states and optimizer states. The file Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer. for optimizer states is not necessary if no need to restore the optimizer.
...@@ -1329,6 +1441,7 @@ class Model(object): ...@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test') model.save('checkpoint/test')
model.load('checkpoint/test') model.load('checkpoint/test')
""" """
def _load_state_from_path(path): def _load_state_from_path(path):
...@@ -1341,17 +1454,24 @@ class Model(object): ...@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None) state = param_state.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the providing file.".format(key)) "{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape): if list(state.shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state.shape), list(param.shape))) key, list(state.shape), list(param.shape)
)
)
return param, state return param, state
def _strip_postfix(path): def _strip_postfix(path):
path, ext = os.path.splitext(path) path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ assert ext in [
"Unknown postfix {} from weights".format(ext) '',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path return path
path = _strip_postfix(path) path = _strip_postfix(path)
...@@ -1365,15 +1485,17 @@ class Model(object): ...@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err: except ValueError as err:
if skip_mismatch: if skip_mismatch:
warnings.warn( warnings.warn(
("Skip loading for {}. ".format(key) + str(err))) ("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens # reset optimizer when mismatch happens
reset_optimizer = True reset_optimizer = True
else: else:
raise err raise err
matched_param_state.append(match_res) matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path( optim_state = (
path + ".pdopt") None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph # TODO: support save/load scaler state in static graph
if _non_static_mode(): if _non_static_mode():
...@@ -1382,13 +1504,15 @@ class Model(object): ...@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'): if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler') scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state, return self._adapter.load(
scaler_state) matched_param_state, optim_state, scaler_state
)
else: else:
return self._adapter.load(matched_param_state, optim_state) return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs): def parameters(self, *args, **kwargs):
""" """
Returns a list of parameters of the model. Returns a list of parameters of the model.
Returns: Returns:
...@@ -1411,17 +1535,19 @@ class Model(object): ...@@ -1411,17 +1535,19 @@ class Model(object):
nn.Linear(200, 10)), input) nn.Linear(200, 10)), input)
params = model.parameters() params = model.parameters()
""" """
return self._adapter.parameters() return self._adapter.parameters()
def _prepare_amp(self, amp_configs): def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs(): def _check_pure_fp16_configs():
# pure float16 training has some restricts now # pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported # clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ assert isinstance(
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {} self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {} self._adapter._amp_configs = {}
@@ -1433,7 +1559,8 @@ class Model(object):
        elif isinstance(amp_configs, str):
            if amp_configs not in ('O0', 'O1', 'O2'):
                raise ValueError(
                    "The level of amp_configs should be 'O0', 'O1' or 'O2'."
                )
            self._adapter._amp_level = amp_configs
            _check_pure_fp16_configs()
            return
@@ -1442,7 +1569,8 @@ class Model(object):
            self._adapter._amp_level = 'O1'
        elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
            raise ValueError(
                "amp_configs['level'] should be 'O0', 'O1' or 'O2'."
            )
        else:
            self._adapter._amp_level = amp_configs['level']
        amp_config_key_set = set(amp_configs.keys()) - {'level'}
@@ -1459,12 +1587,14 @@ class Model(object):
        # construct amp_custom_lists
        if self._adapter._amp_level != 'O0' and amp_config_key_set:
            for param_name in [
                'custom_white_list',
                'custom_black_list',
                'custom_black_varnames',
            ]:
                if param_name in amp_config_key_set:
                    self._adapter._amp_custom_lists[param_name] = amp_configs[
                        param_name
                    ]
                    amp_config_key_set -= {param_name}

        def _check_amp_configs(amp_config_key_set):
@@ -1479,13 +1609,16 @@ class Model(object):
            }
            if amp_config_key_set - accepted_param_set:
                raise ValueError(
                    "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
                        tuple(amp_config_key_set - accepted_param_set)
                    )
                )

            if 'use_fp16_guard' in amp_config_key_set:
                if _non_static_mode():
                    raise ValueError(
                        "'use_fp16_guard' is supported in static mode only."
                    )
                self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
                amp_config_key_set.remove('use_fp16_guard')
@@ -1495,12 +1628,11 @@ class Model(object):
        for key in amp_configs_set:
            self._adapter._amp_configs[key] = amp_configs[key]
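Putting `_prepare_amp` together: the `amp_configs` argument of `Model.prepare` can be a bare level string or a dict carrying the level, the custom lists filtered out above, and loss-scaling options. A rough usage sketch, assuming a standard Paddle 2.x install (the network and values are made up):

# Sketch of the amp_configs forms the validation above accepts.
import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(784, 10)
model = paddle.Model(net, inputs=[InputSpec([None, 784], 'float32', 'x')])

# amp_configs='O1' would also be accepted; the dict form adds custom lists
# and loss-scaling options on top of the level.
model.prepare(
    optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
    loss=paddle.nn.CrossEntropyLoss(),
    amp_configs={
        'level': 'O1',
        'custom_white_list': {'elementwise_add'},
        'custom_black_list': {'reduce_sum'},
        'init_loss_scaling': 1024,
    },
)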
def prepare(self, def prepare(
optimizer=None, self, optimizer=None, loss=None, metrics=None, amp_configs=None
loss=None, ):
metrics=None,
amp_configs=None):
""" """
Configures the model before runing. Configures the model before runing.
Args: Args:
...@@ -1532,6 +1664,7 @@ class Model(object): ...@@ -1532,6 +1664,7 @@ class Model(object):
Returns: Returns:
None None
""" """
self._place = _get_device() self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace): if isinstance(self._place, fluid.CUDAPlace):
...@@ -1539,15 +1672,17 @@ class Model(object): ...@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode(): if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program( startup_prog_seed = (
).random_seed fluid.default_startup_program().random_seed
)
fluid.disable_dygraph() fluid.disable_dygraph()
paddle.disable_static(self._place) paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program, # enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program # thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program( fluid.default_startup_program().random_seed = (
).random_seed = startup_prog_seed startup_prog_seed
)
else: else:
prepare_distributed_context(self._place) prepare_distributed_context(self._place)
_parallel_context_initialized = True _parallel_context_initialized = True
...@@ -1562,15 +1697,16 @@ class Model(object): ...@@ -1562,15 +1697,16 @@ class Model(object):
metrics = metrics or [] metrics = metrics or []
for metric in to_list(metrics): for metric in to_list(metrics):
assert isinstance(metric, Metric), \ assert isinstance(
"{} is not sub class of Metric".format( metric, Metric
metric.__class__.__name__) ), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics) self._metrics = to_list(metrics)
self._prepare_amp(amp_configs) self._prepare_amp(amp_configs)
self._adapter.prepare() self._adapter.prepare()
def fit(self, def fit(
self,
train_data=None, train_data=None,
eval_data=None, eval_data=None,
batch_size=1, batch_size=1,
...@@ -1585,8 +1721,10 @@ class Model(object): ...@@ -1585,8 +1721,10 @@ class Model(object):
num_workers=0, num_workers=0,
callbacks=None, callbacks=None,
accumulate_grad_batches=1, accumulate_grad_batches=1,
num_iters=None): num_iters=None,
):
""" """
Trains the model for a fixed number of epochs. If `eval_data` is set, Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
...@@ -1641,7 +1779,7 @@ class Model(object): ...@@ -1641,7 +1779,7 @@ class Model(object):
How to make a batch is done internally. How to make a batch is done internally.
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example3
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1681,7 +1819,7 @@ class Model(object): ...@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader. DataLoader.
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example4
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1718,31 +1856,38 @@ class Model(object): ...@@ -1718,31 +1856,38 @@ class Model(object):
val_loader, val_loader,
epochs=2, epochs=2,
save_dir='mnist_checkpoint') save_dir='mnist_checkpoint')
""" """
assert train_data is not None, \ assert train_data is not None, "train_data must be given!"
"train_data must be given!"
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data, train_sampler = DistributedBatchSampler(
train_data,
batch_size=batch_size, batch_size=batch_size,
shuffle=shuffle, shuffle=shuffle,
drop_last=drop_last) drop_last=drop_last,
train_loader = DataLoader(train_data, )
train_loader = DataLoader(
train_data,
batch_sampler=train_sampler, batch_sampler=train_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
train_loader = train_data train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler, batch_sampler=eval_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
elif eval_data is not None: elif eval_data is not None:
eval_loader = eval_data eval_loader = eval_data
else: else:
@@ -1755,8 +1900,11 @@ class Model(object):
        steps = self._len_data_loader(train_loader)
        self.num_iters = num_iters
        if (
            num_iters is not None
            and isinstance(num_iters, int)
            and isinstance(steps, int)
        ):
            assert num_iters > 0, "num_iters must be greater than 0!"
            epochs = (num_iters // steps) + 1
            steps = min(num_iters, steps)
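A quick worked example of the arithmetic above (numbers invented):

# How num_iters overrides epochs/steps in fit().
num_iters = 250                       # total training iterations requested
steps = 100                           # batches per epoch from the data loader
epochs = (num_iters // steps) + 1     # -> 3 epochs are scheduled
steps = min(num_iters, steps)         # -> at most 100 steps per epoch
print(epochs, steps)                  # 3 100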
...@@ -1784,10 +1932,10 @@ class Model(object): ...@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0: if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval',
'metrics': self._metrics_name() {'steps': eval_steps, 'metrics': self._metrics_name()},
}) )
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1798,14 +1946,16 @@ class Model(object): ...@@ -1798,14 +1946,16 @@ class Model(object):
cbks.on_end('train', logs) cbks.on_end('train', logs)
self._test_dataloader = None self._test_dataloader = None
def evaluate(self, def evaluate(
self,
eval_data, eval_data,
batch_size=1, batch_size=1,
log_freq=10, log_freq=10,
verbose=2, verbose=2,
num_workers=0, num_workers=0,
callbacks=None, callbacks=None,
num_iters=None): num_iters=None,
):
""" """
Evaluate the loss and metrics of the model on input dataset. Evaluate the loss and metrics of the model on input dataset.
...@@ -1859,13 +2009,16 @@ class Model(object): ...@@ -1859,13 +2009,16 @@ class Model(object):
""" """
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler, batch_sampler=eval_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
eval_loader = eval_data eval_loader = eval_data
...@@ -1881,15 +2034,17 @@ class Model(object): ...@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
eval_steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps) eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps self.num_iters = eval_steps
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
'metrics': self._metrics_name() )
})
logs = self._run_one_epoch(eval_loader, cbks, 'eval') logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1903,13 +2058,15 @@ class Model(object): ...@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result return eval_result
def predict(self, def predict(
self,
test_data, test_data,
batch_size=1, batch_size=1,
num_workers=0, num_workers=0,
stack_outputs=False, stack_outputs=False,
verbose=1, verbose=1,
callbacks=None): callbacks=None,
):
""" """
Compute the output predictions on testing data. Compute the output predictions on testing data.
...@@ -1980,13 +2137,16 @@ class Model(object): ...@@ -1980,13 +2137,16 @@ class Model(object):
""" """
if test_data is not None and isinstance(test_data, Dataset): if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data, test_sampler = DistributedBatchSampler(
batch_size=batch_size) test_data, batch_size=batch_size
test_loader = DataLoader(test_data, )
test_loader = DataLoader(
test_data,
batch_sampler=test_sampler, batch_sampler=test_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
test_loader = test_data test_loader = test_data
...@@ -2036,7 +2196,8 @@ class Model(object): ...@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred: if self._is_shape_inferred:
warnings.warn( warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0]) % self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs) paddle.jit.save(layer, path, input_spec=self._inputs)
...@@ -2047,7 +2208,8 @@ class Model(object): ...@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError( raise ValueError(
"The input path MUST be format of dirname/file_prefix " "The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received " "[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.") "file_prefix is empty string."
)
dirname = os.path.dirname(path) dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
...@@ -2058,21 +2220,24 @@ class Model(object): ...@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None) prog = self._adapter._progs.get('test', None)
assert prog, \ assert (
"Model is not ready, please call `model.prepare()` first" prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True) infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']] input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output'] endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path, fluid.io.save_inference_model(
model_path,
input_names, input_names,
endpoints, endpoints,
self._adapter._executor, self._adapter._executor,
main_program=infer_prog, main_program=infer_prog,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename,
)
def _run_one_epoch( def _run_one_epoch(
self, self,
...@@ -2098,16 +2263,21 @@ class Model(object): ...@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from # LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph # DataLoader in static graph
batch_size = data[0].shape()[0] if callable( batch_size = (
data[0].shape) else data[0].shape[0] data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs) callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict': if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]] _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train': if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0 _inputs.append(
or step + 1 == len(data_loader)) (step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs) outs = getattr(self, mode + '_batch')(*_inputs)
...@@ -2128,15 +2298,17 @@ class Model(object): ...@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v logs[k] = v
else: else:
if self._inputs is not None: if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)]) outs = self.predict_batch(data[: len(self._inputs)])
else: else:
outs = self.predict_batch(data) outs = self.predict_batch(data)
outputs.append(outs) outputs.append(outs)
logs['step'] = step logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get( if (
mode + '_batch', 0) <= 0: mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks logs['batch_size'] = batch_size * ParallelEnv().nranks
else: else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
...@@ -2190,8 +2362,9 @@ class Model(object): ...@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610} # {'total_params': 61610, 'trainable_params': 61610}
""" """
assert (input_size is not None or self._inputs assert (
is not None), "'input_size' or 'self._input' must be set" input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None: if input_size is not None:
_input_size = input_size _input_size = input_size
else: else:
...@@ -2208,7 +2381,10 @@ class Model(object): ...@@ -2208,7 +2381,10 @@ class Model(object):
if is_input: if is_input:
arg_names = extract_args(self.network.forward)[1:] arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running. # While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode( if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
): ):
out_specs = [ out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i]) Input(name=n, dtype=dtypes[i], shape=shapes[i])
...@@ -2221,7 +2397,8 @@ class Model(object): ...@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict): elif isinstance(specs, dict):
assert is_input is False assert is_input is False
out_specs = [ out_specs = [
specs[n] for n in extract_args(self.network.forward) specs[n]
for n in extract_args(self.network.forward)
if n != 'self' if n != 'self'
] ]
else: else:
...@@ -2232,8 +2409,10 @@ class Model(object): ...@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input) assert isinstance(spec, Input)
if spec.name is None: if spec.name is None:
raise ValueError( raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}." "Requires Input[{}].name != None, but receive `None` with {}.".format(
.format(i, spec)) i, spec
)
)
return out_specs return out_specs
@@ -2258,6 +2437,7 @@ class Model(object):
        "Update self._inputs according to given inputs."
        self._input_info = self._adapter._input_info
        if self._input_info is not None and len(self._input_info) == 2:
            self._inputs = self._verify_spec(
                None, self._input_info[0], self._input_info[1], True
            )
            self._is_shape_inferred = True
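The shape-inference branch above is what runs when no input specs were passed to the constructor. A short sketch of the two paths in dygraph mode, with hypothetical shapes:

# Explicit spec: the exported inference model keeps a stable [None, 784] shape.
import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(784, 10)
explicit = paddle.Model(net, inputs=[InputSpec([None, 784], 'float32', 'x')])

# No spec: the shape is inferred from the first real batch, which is what
# later triggers the "_is_shape_inferred" warning when saving for inference.
inferred = paddle.Model(net)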
@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
    name=None,
):
    r"""
    The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:

    .. code-block:: python

        y = layer_norm(residual + dropout(bias + x))

    Parameters:
@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor, The output Tensor, the data type and shape is same as `x`.

    Examples:

        .. code-block:: python

            # required: gpu
@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
                x, residual, bias)
            # [2, 4, 128]
            print(output.shape)

    """
    seed = None
    if mode not in ('downscale_in_infer', 'upscale_in_train'):
...
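As a rough, unfused reference for the pseudo code in the docstring above, the same math can be spelled out with plain functional ops. This only illustrates the semantics; the API dispatches to a fused GPU kernel, and dropout randomness will differ.

# Unfused reference (illustration only) for y = layer_norm(residual + dropout(bias + x)).
import paddle
import paddle.nn.functional as F

x = paddle.rand((2, 4, 128))
residual = paddle.rand((2, 4, 128))
bias = paddle.rand((128,))
weight = paddle.ones((128,))
ln_bias = paddle.zeros((128,))

y = F.layer_norm(
    residual + F.dropout(x + bias, p=0.5, training=True),
    normalized_shape=[128],
    weight=weight,
    bias=ln_bias,
)
print(y.shape)  # [2, 4, 128]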
@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer
from paddle.framework import ParamAttr
import paddle
from paddle.nn.layer.transformer import (
    _convert_attention_mask,
    _convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
    if t.place.is_gpu_place():
        size_dtype = core.size_of_dtype(dtype)
        waiting_alloc_memory = (
            ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
        )
        gpu_memory_available = core.gpu_memory_available()
        if gpu_memory_available < waiting_alloc_memory:
            t_used = t._copy_to(paddle.CPUPlace(), False)
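The `waiting_alloc_memory` expression above estimates the target allocation rounded up to the next 256-byte block plus 20% headroom. A small worked example with an invented shape:

# Worked example of the padded allocation estimate.
import numpy as np

shape = (2, 4, 128)       # tensor shape
size_dtype = 2            # bytes per element for float16
raw_bytes = np.prod(shape) * size_dtype            # 2048 bytes
padded = (raw_bytes / 256 + 1) * 256 * 1.2         # next 256 B block + 20% headroom
print(raw_bytes, padded)                           # 2048 2764.8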
...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
dropout_rate=0.5, dropout_rate=0.5,
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
epsilon=1e-5, epsilon=1e-5,
name=None): name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__() super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but recieved {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._weight_attr = weight_attr self._weight_attr = weight_attr
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim], self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=self._bias_attr, attr=self._bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=self._weight_attr, attr=self._weight_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=self._bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self._epsilon = epsilon self._epsilon = epsilon
...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
mode='upscale_in_train', mode='upscale_in_train',
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, self.embed_dim,
self._dtype, name_str) self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer): class FusedMultiHeadAttention(Layer):
...@@ -246,7 +263,8 @@ class FusedMultiHeadAttention(Layer): ...@@ -246,7 +263,8 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
num_heads, num_heads,
dropout_rate=0.5, dropout_rate=0.5,
...@@ -266,13 +284,19 @@ class FusedMultiHeadAttention(Layer): ...@@ -266,13 +284,19 @@ class FusedMultiHeadAttention(Layer):
epsilon=1e-5, epsilon=1e-5,
nranks=1, nranks=1,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedMultiHeadAttention, self).__init__() super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer): ...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim self.kdim = kdim
self.vdim = vdim self.vdim = vdim
self.need_weights = need_weights self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now." assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel # tensor model parallel
...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer): ...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim], shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self.qkv_bias = self.create_parameter( self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.linear_weight = self.create_parameter( self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self.linear_bias = self.create_parameter(shape=[embed_dim], )
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr, attr=linear_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter( self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr, attr=pre_ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, )
shape=[embed_dim], self.pre_ln_bias = self.create_parameter(
is_bias=True) attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None self.ln_scale = None
self.ln_bias = None self.ln_bias = None
else: else:
...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate self.attn_dropout_rate = attn_dropout_rate
...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer): ...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate, self.embed_dim,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, self.num_heads,
self.normalize_before, self.need_weights, self._dtype, name_str) self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -495,7 +538,8 @@ class FusedFeedForward(Layer): ...@@ -495,7 +538,8 @@ class FusedFeedForward(Layer):
# (1, 8, 8) # (1, 8, 8)
""" """
def __init__(self, def __init__(
self,
d_model, d_model,
dim_feedforward, dim_feedforward,
dropout_rate=0.1, dropout_rate=0.1,
...@@ -513,15 +557,20 @@ class FusedFeedForward(Layer): ...@@ -513,15 +557,20 @@ class FusedFeedForward(Layer):
ln2_bias_attr=None, ln2_bias_attr=None,
nranks=1, nranks=1,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedFeedForward, self).__init__() super(FusedFeedForward, self).__init__()
assert d_model > 0, ( assert (
"Expected d_model to be greater than 0, but received {}".format( d_model > 0
d_model)) ), "Expected d_model to be greater than 0, but received {}".format(
assert dim_feedforward > 0, ( d_model
"Expected dim_feedforward to be greater than 0, but received {}". )
format(dim_feedforward)) assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._d_model = d_model self._d_model = d_model
...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer): ...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation self._act_method = activation
self._normalize_before = normalize_before self._normalize_before = normalize_before
self._epsilon = epsilon self._epsilon = epsilon
...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer): ...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward], shape=[d_model, dim_feedforward],
attr=linear1_weight_attr, attr=linear1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self._linear1_bias = self.create_parameter(shape=[dim_feedforward], )
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=linear1_bias_attr, attr=linear1_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self._linear2_weight = self.create_parameter( self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model], shape=[dim_feedforward, d_model],
attr=linear2_weight_attr, attr=linear2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model], self._linear2_bias = self.create_parameter(
shape=[d_model],
attr=linear2_bias_attr, attr=linear2_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
if nranks > 1: if nranks > 1:
assert ring_id != -1 assert ring_id != -1
...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer): ...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln1_scale_attr, attr=ln1_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln1_bias = self.create_parameter(shape=[d_model], )
attr=ln1_bias_attr, self._ln1_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None self._ln2_scale = None
self._ln2_bias = None self._ln2_bias = None
else: else:
...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer): ...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln2_scale_attr, attr=ln2_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln2_bias = self.create_parameter(shape=[d_model], )
attr=ln2_bias_attr, self._ln2_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name self.name = name
...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer): ...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before, pre_layer_norm=self._normalize_before,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate, self._d_model,
self._epsilon, self._act_method, self._act_dropout_rate, self._dim_feedforward,
self._normalize_before, self._dtype, name_str) self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
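
    For reference, a minimal usage sketch of this layer, assuming the public `paddle.incubate.nn.FusedFeedForward` export and a GPU build of Paddle (the fused kernels are GPU-only); the shapes simply follow the constructor arguments handled above.

    .. code-block:: python

        # required: gpu
        import paddle
        from paddle.incubate.nn import FusedFeedForward

        fused_ffn = FusedFeedForward(d_model=8, dim_feedforward=8)
        x = paddle.rand((1, 8, 8))   # [batch_size, seq_len, d_model]
        out = fused_ffn(x)           # same shape as x
        print(out.shape)             # [1, 8, 8]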
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer): ...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer): class FusedTransformerEncoderLayer(Layer):
""" """
    FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
    attention and feedforward network. Before and after each sub-layer, pre-process
    and post-process would be applied on the input and output accordingly. If
...@@ -681,7 +749,6 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -681,7 +749,6 @@ class FusedTransformerEncoderLayer(Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
...@@ -694,9 +761,11 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -694,9 +761,11 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4)) attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
d_model, d_model,
nhead, nhead,
dim_feedforward, dim_feedforward,
...@@ -706,21 +775,33 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -706,21 +775,33 @@ class FusedTransformerEncoderLayer(Layer):
act_dropout_rate=None, act_dropout_rate=None,
normalize_before=False, normalize_before=False,
weight_attr=None, weight_attr=None,
bias_attr=None): bias_attr=None,
):
self._config = locals() self._config = locals()
self._config.pop("self") self._config.pop("self")
self._config.pop("__class__", None) # py3 self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__() super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, " assert (
"but received {}".format(d_model)) d_model > 0
assert nhead > 0, ("Expected nhead to be greater than 0, " ), "Expected d_model to be greater than 0, " "but received {}".format(
"but received {}".format(nhead)) d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, ( assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, " "Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward)) "but received {}".format(dim_feedforward)
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate )
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2) weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
...@@ -739,9 +820,11 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -739,9 +820,11 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0], pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0], pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0], ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0]) ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(d_model, self.ffn = FusedFeedForward(
d_model,
dim_feedforward, dim_feedforward,
dropout_rate=dropout_rate, dropout_rate=dropout_rate,
activation=activation, activation=activation,
...@@ -750,11 +833,14 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -750,11 +833,14 @@ class FusedTransformerEncoderLayer(Layer):
linear1_weight_attr=weight_attrs[1], linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1], linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1], linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1]) linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None): def forward(self, src, src_mask=None, cache=None):
""" """
Applies a Transformer encoder layer on the input. Applies a Transformer encoder layer on the input.
Parameters: Parameters:
src (Tensor): The input of Transformer encoder layer. It is src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. a tensor with shape `[batch_size, sequence_length, d_model]`.
...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when `-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None. nothing wanted or needed to be prevented attention to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default only used for inference and should be None for training. Default
None. None.
Returns: Returns:
            Tensor|tuple, It is a tensor that has the same shape and data type \
                as `src`, representing the output of the Transformer encoder \
                layer. Or a tuple if `cache` is not None: besides the encoder \
                layer output, the tuple includes the new cache, which is the same \
                as the input `cache` argument but with an incremental length for \
                `incremental_cache`. See `MultiHeadAttention.gen_cache` and \
                `MultiHeadAttention.forward` for more details.
""" """
src_mask = _convert_attention_mask(src_mask, src.dtype) src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None: if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask) attn_out = self.fused_attn(src, attn_mask=src_mask)
else: else:
attn_out, incremental_cache = self.fused_attn(src, attn_out, incremental_cache = self.fused_attn(
attn_mask=src_mask, src, attn_mask=src_mask, cache=cache
cache=cache) )
ffn_out = self.ffn(attn_out) ffn_out = self.ffn(attn_out)
...@@ -889,7 +977,8 @@ class FusedTransformer(Layer): ...@@ -889,7 +977,8 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128] cross_attn_mask) # [2, 6, 128]
""" """
def __init__(self, def __init__(
self,
d_model=512, d_model=512,
nhead=8, nhead=8,
num_encoder_layers=6, num_encoder_layers=6,
...@@ -903,7 +992,8 @@ class FusedTransformer(Layer): ...@@ -903,7 +992,8 @@ class FusedTransformer(Layer):
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
custom_encoder=None, custom_encoder=None,
custom_decoder=None): custom_decoder=None,
):
        super(FusedTransformer, self).__init__()
raise NotImplementedError() raise NotImplementedError()
...@@ -1071,7 +1161,8 @@ class FusedMultiTransformer(Layer): ...@@ -1071,7 +1161,8 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
num_heads, num_heads,
dim_feedforward, dim_feedforward,
...@@ -1095,16 +1186,24 @@ class FusedMultiTransformer(Layer): ...@@ -1095,16 +1186,24 @@ class FusedMultiTransformer(Layer):
nranks=1, nranks=1,
trans_qkvw=True, trans_qkvw=True,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedMultiTransformer, self).__init__() super(FusedMultiTransformer, self).__init__()
        assert embed_dim > 0, (
            "Expected embed_dim to be greater than 0, "
            "but received {}".format(embed_dim)
        )
        assert (
            num_heads > 0
        ), "Expected num_heads to be greater than 0, " "but received {}".format(
            num_heads
        )
        assert (
            dim_feedforward > 0
        ), "Expected dim_feedforward to be greater than 0, but received {}".format(
            dim_feedforward
        )
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer): ...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer): ...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter( ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
qkv_weight = self.create_parameter( qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim] shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
qkv_bias = self.create_parameter( qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
linear_weight = self.create_parameter( linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
linear_bias = self.create_parameter(shape=[embed_dim], )
linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr, attr=linear_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
ffn_ln_scale = self.create_parameter( ffn_ln_scale = self.create_parameter(
shape=[embed_dim], shape=[embed_dim],
attr=ffn_ln_scale_attr, attr=ffn_ln_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
ffn_ln_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn_ln_bias_attr, ffn_ln_bias = self.create_parameter(
is_bias=True) shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter( ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward], shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr, attr=ffn1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn1_bias = self.create_parameter(shape=[dim_feedforward], )
ffn1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=ffn1_bias_attr, attr=ffn1_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
ffn2_weight = self.create_parameter( ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim], shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr, attr=ffn2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn2_bias = self.create_parameter(shape=[embed_dim], )
ffn2_bias = self.create_parameter(
shape=[embed_dim],
attr=ffn2_bias_attr, attr=ffn2_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer): ...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train', mode='upscale_in_train',
trans_qkvw=self._trans_qkvw, trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
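
For reference, a hedged construction-plus-forward sketch that completes the truncated docstring example above; it assumes the `paddle.incubate.nn.FusedMultiTransformer` export, a GPU build, and the default weight/bias attribute arguments.

.. code-block:: python

    # required: gpu
    import paddle
    from paddle.incubate.nn import FusedMultiTransformer

    # one fused block: embed_dim=128, num_heads=2, dim_feedforward=512
    encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
    enc_input = paddle.rand((2, 4, 128))
    attn_mask = paddle.rand((2, 2, 4, 4))
    enc_output = encoder_layers(enc_input, attn_mask)   # [2, 4, 128]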
...@@ -20,14 +20,17 @@ from paddle.fluid import core ...@@ -20,14 +20,17 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row, def graph_khop_sampler(
row,
colptr, colptr,
input_nodes, input_nodes,
sample_sizes, sample_sizes,
sorted_eids=None, sorted_eids=None,
return_eids=False, return_eids=False,
name=None): name=None,
):
""" """
Graph Khop Sampler API. Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -50,24 +53,23 @@ def graph_khop_sampler(row, ...@@ -50,24 +53,23 @@ def graph_khop_sampler(row,
sample_sizes (list|tuple): The number of neighbors and number of layers we want sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape to sample. The data type should be int, and the shape
should only have one dimension. should only have one dimension.
        sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
                                        is True. The shape should be [num_edges, 1], and the data
                                        type should be the same as `row`. Default is None.
        return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        - edge_src (Tensor), The src index of the output edges, i.e. the first column of
          the edges. The shape is [num_sample_edges, 1] currently.
        - edge_dst (Tensor), The dst index of the output edges, i.e. the second column
          of the edges. The shape is [num_sample_edges, 1] currently.
        - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
        - reindex_nodes (Tensor), The reindex id of the input nodes.
        - edge_eids (Tensor), The id of the sample edges, returned only if `return_eids` is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -80,44 +82,72 @@ def graph_khop_sampler(row, ...@@ -80,44 +82,72 @@ def graph_khop_sampler(row,
colptr = paddle.to_tensor(colptr, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64")
            edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, return_eids=False)
""" """
if _non_static_mode(): if _non_static_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None "
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ f"if return_eids is True."
_legacy_C_ops.graph_khop_sampler(row, sorted_eids, )
colptr, input_nodes, (
"sample_sizes", sample_sizes, edge_src,
"return_eids", True) edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \ (
_legacy_C_ops.graph_khop_sampler(row, None, edge_src,
colptr, input_nodes, edge_dst,
"sample_sizes", sample_sizes, sample_index,
"return_eids", False) reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None " f"if return_eids is True."
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_khop_sampler")
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals()) helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
...@@ -125,24 +155,23 @@ def graph_khop_sampler(row, ...@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler", helper.append_op(
type="graph_khop_sampler",
inputs={ inputs={
"Row": row, "Row": row,
"Eids": sorted_eids, "Eids": sorted_eids,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes "X": input_nodes,
}, },
outputs={ outputs={
"Out_Src": edge_src, "Out_Src": edge_src,
"Out_Dst": edge_dst, "Out_Dst": edge_dst,
"Sample_Index": sample_index, "Sample_Index": sample_index,
"Reindex_X": reindex_nodes, "Reindex_X": reindex_nodes,
"Out_Eids": edge_eids "Out_Eids": edge_eids,
}, },
attrs={ attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
"sample_sizes": sample_sizes, )
"return_eids": return_eids
})
if return_eids: if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
......
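
When `return_eids` is True the op also returns the sampled edge ids; below is a hedged sketch of that call path. The graph tensors are illustrative, and the edge ids are simply 0..num_edges-1 assigned in CSC order to satisfy the `sorted_eids` requirement described above.

.. code-block:: python

    import paddle

    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
    sorted_eids = paddle.arange(row.shape[0], dtype="int64")   # edge ids aligned with `row`
    sample_sizes = [2, 2]

    edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \
        paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes,
                                           sorted_eids=sorted_eids, return_eids=True)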
...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0", @deprecated(
since="2.4.0",
update_to="paddle.geometric.reindex_graph", update_to="paddle.geometric.reindex_graph",
level=1, level=1,
reason="paddle.incubate.graph_reindex will be removed in future") reason="paddle.incubate.graph_reindex will be removed in future",
def graph_reindex(x, )
def graph_reindex(
x,
neighbors, neighbors,
count, count,
value_buffer=None, value_buffer=None,
index_buffer=None, index_buffer=None,
flag_buffer_hashtable=False, flag_buffer_hashtable=False,
name=None): name=None,
):
""" """
Graph Reindex API. Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -40,7 +45,7 @@ def graph_reindex(x, ...@@ -40,7 +45,7 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex. corresponding graph edges after reindex.
**Notes**: Notes:
The number in x should be unique, otherwise it would cause potential errors. The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x. edge_type neighbors for x, we should concatenate all the neighbors and count of x.
...@@ -58,24 +63,23 @@ def graph_reindex(x, ...@@ -58,24 +63,23 @@ def graph_reindex(x,
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently. Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_src (Tensor), The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex. - reindex_dst (Tensor), The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor where we put the input nodes `x` in the front, and put neighbor
nodes in the back. nodes in the back.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -109,47 +113,55 @@ def graph_reindex(x, ...@@ -109,47 +113,55 @@ def graph_reindex(x,
""" """
if flag_buffer_hashtable: if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None: if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not" raise ValueError(
"be None if `flag_buffer_hashtable` is True.") f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode(): if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, x,
"flag_buffer_hashtable", flag_buffer_hashtable) neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), check_variable_and_dtype(
"graph_reindex") neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable: if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), check_variable_and_dtype(
"graph_reindex") value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), )
"graph_reindex") check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals()) helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex", helper.append_op(
type="graph_reindex",
inputs={ inputs={
"X": "X": x,
x, "Neighbors": neighbors,
"Neighbors": "Count": count,
neighbors, "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
"Count": "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
count,
"HashTable_Value":
value_buffer if flag_buffer_hashtable else None,
"HashTable_Index":
index_buffer if flag_buffer_hashtable else None,
}, },
outputs={ outputs={
"Reindex_Src": reindex_src, "Reindex_Src": reindex_src,
"Reindex_Dst": reindex_dst, "Reindex_Dst": reindex_dst,
"Out_Nodes": out_nodes "Out_Nodes": out_nodes,
}, },
attrs={"flag_buffer_hashtable": flag_buffer_hashtable}) attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
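
A small worked sketch of the reindex semantics described above; the input values are illustrative, and the outputs in the comments follow the rule that `out_nodes` is `x` followed by the previously unseen neighbors in order of first appearance.

.. code-block:: python

    import paddle

    x = paddle.to_tensor([0, 5, 8, 9], dtype="int64")
    neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype="int64")
    count = paddle.to_tensor([2, 2, 1, 2], dtype="int32")

    reindex_src, reindex_dst, out_nodes = \
        paddle.incubate.graph_reindex(x, neighbors, count)
    # out_nodes:   [0, 5, 8, 9, 4, 7, 6]
    # reindex_src: [2, 3, 0, 4, 5, 6, 5]   (neighbors mapped into out_nodes)
    # reindex_dst: [0, 0, 1, 1, 2, 3, 3]   (each x[i] repeated count[i] times)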
...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated ...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated
since="2.4.0", since="2.4.0",
update_to="paddle.geometric.sample_neighbors", update_to="paddle.geometric.sample_neighbors",
level=1, level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future") reason="paddle.incubate.graph_sample_neighbors will be removed in future",
def graph_sample_neighbors(row, )
def graph_sample_neighbors(
row,
colptr, colptr,
input_nodes, input_nodes,
eids=None, eids=None,
...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row, ...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row,
sample_size=-1, sample_size=-1,
return_eids=False, return_eids=False,
flag_perm_buffer=False, flag_perm_buffer=False,
name=None): name=None,
):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row, ...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
        - out_neighbors (Tensor), The sampled neighbors of the input nodes.
        - out_count (Tensor), The number of sampled neighbors of each input node; the shape should be the same as `input_nodes`.
        - out_eids (Tensor), If `return_eids` is True, the eid information of the sampled edges will be returned.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row, ...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row,
if return_eids: if return_eids:
if eids is None: if eids is None:
raise ValueError( raise ValueError(
f"`eids` should not be None if `return_eids` is True.") f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer: if flag_perm_buffer:
if perm_buffer is None: if perm_buffer is None:
raise ValueError( raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`" f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.") "is True."
)
if _non_static_mode(): if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( (
row, colptr, input_nodes, eids, perm_buffer, "sample_size", out_neighbors,
sample_size, "return_eids", return_eids, "flag_perm_buffer", out_count,
flag_perm_buffer) out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") row, "Row", ("int32", "int64"), "graph_sample_neighbors"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_sample_neighbors") check_variable_and_dtype(
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
"graph_sample_neighbors") )
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids: if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer: if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals()) helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors", helper.append_op(
type="graph_sample_neighbors",
inputs={ inputs={
"Row": row, "Row": row,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes, "X": input_nodes,
"Eids": eids if return_eids else None, "Eids": eids if return_eids else None,
"Perm_Buffer": "Perm_Buffer": perm_buffer if flag_perm_buffer else None,
perm_buffer if flag_perm_buffer else None
}, },
outputs={ outputs={
"Out": out_neighbors, "Out": out_neighbors,
"Out_Count": out_count, "Out_Count": out_count,
"Out_Eids": out_eids "Out_Eids": out_eids,
}, },
attrs={ attrs={
"sample_size": sample_size, "sample_size": sample_size,
"return_eids": return_eids, "return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer "flag_perm_buffer": flag_perm_buffer,
}) },
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
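
A minimal sketch of the default sampling path (no eids, no permutation buffer), built from the CSC graph spelled out in the docstring's edge list above; `sample_size=3` caps the number of neighbors kept per input node, and the chosen input nodes are illustrative.

.. code-block:: python

    import paddle

    # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
    #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")

    out_neighbors, out_count = \
        paddle.incubate.graph_sample_neighbors(row, colptr, nodes, sample_size=3)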
...@@ -36,7 +36,8 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -36,7 +36,8 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] __all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(x, def resnet_basic_block(
x,
filter1, filter1,
scale1, scale1,
bias1, bias1,
...@@ -69,73 +70,198 @@ def resnet_basic_block(x, ...@@ -69,73 +70,198 @@ def resnet_basic_block(x,
use_global_stats=None, use_global_stats=None,
training=False, training=False,
trainable_statistics=False, trainable_statistics=False,
find_conv_max=True): find_conv_max=True,
):
if fluid.framework.in_dygraph_mode(): if fluid.framework.in_dygraph_mode():
attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, attrs = (
'padding1', padding1, 'padding2', padding2, 'padding3', 'stride1',
padding3, 'dilation1', dilation1, 'dilation2', dilation2, stride1,
'dilation3', dilation3, 'group', groups, 'momentum', momentum, 'stride2',
'epsilon', eps, 'data_format', data_format, 'has_shortcut', stride2,
has_shortcut, 'use_global_stats', use_global_stats, 'stride3',
"trainable_statistics", trainable_statistics, 'is_test', stride3,
not training, 'act_type', "relu", 'find_conv_input_max', 'padding1',
find_conv_max) padding1,
'padding2',
out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ padding2,
getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ 'padding3',
filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out return out
helper = LayerHelper('resnet_basic_block', **locals()) helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32 bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(dtype=x.dtype, out = helper.create_variable_for_type_inference(
stop_gradient=True) dtype=x.dtype, stop_gradient=True
conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference( saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference( saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean1 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 running_mean1 = (
running_var1 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 dtype=bn_param_dtype, stop_gradient=True
conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean1 is None
conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, else mean1
stop_gradient=True) )
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference( saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference( saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean2 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 running_mean2 = (
running_var2 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 dtype=bn_param_dtype, stop_gradient=True
conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference( saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference( saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean3 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 running_mean3 = (
running_var3 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference( conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference( conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference( conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference( conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference( conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference( conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
inputs = { inputs = {
'X': x, 'X': x,
...@@ -175,7 +301,7 @@ def resnet_basic_block(x, ...@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics, "trainable_statistics": trainable_statistics,
'is_test': not training, 'is_test': not training,
'act_type': "relu", 'act_type': "relu",
'find_conv_input_max': find_conv_max 'find_conv_input_max': find_conv_max,
} }
outputs = { outputs = {
...@@ -203,39 +329,120 @@ def resnet_basic_block(x, ...@@ -203,39 +329,120 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max, 'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max, 'MaxFilter3': conv3_filter_max,
} }
helper.append_op(type='resnet_basic_block', helper.append_op(
inputs=inputs, type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
class ResNetBasicBlock(Layer): class ResNetBasicBlock(Layer):
""" r"""
    ResNetBasicBlock is designed to optimize the performance of the basic unit of the SSD ResNet block.
    If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one pass.
    If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one pass. In this
    case the shape of the output is the same as that of the input.

    Args:
        num_channels (int): The number of input image channels.
        num_filter (int): The number of filters. It is the same as the number of output image channels.
        filter_size (int|list|tuple): The filter size. If filter_size
            is a tuple, it must contain two integers, (filter_size_height,
            filter_size_width). Otherwise, filter_size_height = filter_size_width = filter_size.
        stride (int, optional): The stride size. It means the stride in convolution.
            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            Default: None
        momentum (float, optional): The value used for the moving_mean and
            moving_var computation. This should be a float number or a Tensor with
            shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. Currently only `"NCHW"` is supported, where the data is stored in
the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
            test phase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
        padding (int, optional): The padding size. It only supports padding_height = padding_width = padding.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
            points. It only supports dilation_height = dilation_width = dilation.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
        A Tensor representing the output of the ResNetBasicBlock, whose data type is the same as the input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
""" """
def __init__(self, def __init__(
self,
num_channels1, num_channels1,
num_filter1, num_filter1,
filter1_size, filter1_size,
...@@ -277,14 +484,17 @@ class ResNetBasicBlock(Layer): ...@@ -277,14 +484,17 @@ class ResNetBasicBlock(Layer):
dilation2=1, dilation2=1,
dilation3=1, dilation3=1,
trainable_statistics=False, trainable_statistics=False,
find_conv_max=True): find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__() super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1 self._stride1 = stride1
self._stride2 = stride2 self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(filter1_size, 2, self._kernel1_size = utils.convert_to_list(
'filter1_size') filter1_size, 2, 'filter1_size'
self._kernel2_size = utils.convert_to_list(filter2_size, 2, )
'filter2_size') self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1 self._dilation1 = dilation1
self._dilation2 = dilation2 self._dilation2 = dilation2
self._padding1 = padding1 self._padding1 = padding1
...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer): ...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max self._find_conv_max = find_conv_max
if has_shortcut: if has_shortcut:
self._kernel3_size = utils.convert_to_list(filter3_size, 2, self._kernel3_size = utils.convert_to_list(
'filter3_size') filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3 self._padding3 = padding3
self._stride3 = stride3 self._stride3 = stride3
self._dilation3 = dilation3 self._dilation3 = dilation3
...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer): ...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format: if data_format not in valid_format:
raise ValueError( raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format( "conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format)) valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size): def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num)**0.5 std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std) return I.Normal(0.0, std)
# init filter # init filter
...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer): ...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape, shape=filter1_shape,
attr=filter1_attr, attr=filter1_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size)) num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter( self.scale_1 = self.create_parameter(
shape=bn1_param_shape, shape=bn1_param_shape,
attr=scale1_attr, attr=scale1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_1 = self.create_parameter(shape=bn1_param_shape, )
self.bias_1 = self.create_parameter(
shape=bn1_param_shape,
attr=bias1_attr, attr=bias1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_1 = self.create_parameter(attr=ParamAttr( )
self.mean_1 = self.create_parameter(
attr=ParamAttr(
name=moving_mean1_name, name=moving_mean1_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter( self.var_1 = self.create_parameter(
attr=ParamAttr(name=moving_var1_name, attr=ParamAttr(
name=moving_var1_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter( self.filter_2 = self.create_parameter(
shape=filter2_shape, shape=filter2_shape,
attr=filter2_attr, attr=filter2_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size)) num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter( self.scale_2 = self.create_parameter(
shape=bn2_param_shape, shape=bn2_param_shape,
attr=scale2_attr, attr=scale2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_2 = self.create_parameter(shape=bn2_param_shape, )
self.bias_2 = self.create_parameter(
shape=bn2_param_shape,
attr=bias2_attr, attr=bias2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_2 = self.create_parameter(attr=ParamAttr( )
self.mean_2 = self.create_parameter(
attr=ParamAttr(
name=moving_mean2_name, name=moving_mean2_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter( self.var_2 = self.create_parameter(
attr=ParamAttr(name=moving_var2_name, attr=ParamAttr(
name=moving_var2_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True self.var_2.stop_gradient = True
if has_shortcut: if has_shortcut:
bn3_param_shape = [1, 1, num_filter3] bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [ filter3_shape = [
num_filter3, num_channels3, filter3_size, filter3_size num_filter3,
num_channels3,
filter3_size,
filter3_size,
] ]
self.filter_3 = self.create_parameter( self.filter_3 = self.create_parameter(
shape=filter3_shape, shape=filter3_shape,
attr=filter3_attr, attr=filter3_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size)) num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter( self.scale_3 = self.create_parameter(
shape=bn3_param_shape, shape=bn3_param_shape,
attr=scale3_attr, attr=scale3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_3 = self.create_parameter(shape=bn3_param_shape, )
self.bias_3 = self.create_parameter(
shape=bn3_param_shape,
attr=bias3_attr, attr=bias3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_3 = self.create_parameter(attr=ParamAttr( )
self.mean_3 = self.create_parameter(
attr=ParamAttr(
name=moving_mean3_name, name=moving_mean3_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn3_param_shape, shape=bn3_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(attr=ParamAttr( self.var_3 = self.create_parameter(
attr=ParamAttr(
name=moving_var3_name, name=moving_var3_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn3_param_shape, shape=bn3_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True self.var_3.stop_gradient = True
else: else:
self.filter_3 = None self.filter_3 = None
...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer): ...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats, use_global_stats=self._use_global_stats,
training=self.training, training=self.training,
trainable_statistics=self._trainable_statistics, trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max) find_conv_max=self._find_conv_max,
)
return out return out
@@ -715,6 +715,7 @@ def upsample(
    name=None,
):
    """

    This API resizes a batch of images.

    The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
@@ -725,11 +726,12 @@ def upsample(
    and the resizing only applies on the three dimensions (depth, height and width).

    Supporting resample methods:

        - 'linear' : Linear interpolation
        - 'bilinear' : Bilinear interpolation
        - 'trilinear' : Trilinear interpolation
        - 'nearest' : Nearest neighbor interpolation
        - 'bicubic' : Bicubic interpolation

    Linear interpolation is the method of using a line connecting two known quantities
    to determine the value of an unknown quantity between the two known quantities.
@@ -831,8 +833,9 @@ def upsample(
        D_out = D_{in} * scale_{factor}
        H_out = H_{in} * scale_{factor}
        W_out = W_{in} * scale_{factor}

    For details of linear interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Linear_interpolation.

    For details of nearest neighbor interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
@@ -876,6 +879,7 @@ def upsample(
        name(str, optional): The default value is None.
            Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`

    Returns:
        A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
...
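As a hedged illustration of the resize modes and the ``scale_factor`` relation documented above (not part of the diff; it assumes the released paddle.nn.functional.upsample API):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # NCHW input: one image, 3 channels, 4 x 4 spatial size.
    x = paddle.rand([1, 3, 4, 4])

    # H_out = H_in * scale_factor, W_out = W_in * scale_factor.
    y = F.upsample(x, scale_factor=2.0, mode='bilinear')
    print(y.shape)  # [1, 3, 8, 8]

    # An explicit output size can be passed instead of a scale factor.
    z = F.upsample(x, size=[6, 6], mode='nearest')
    print(z.shape)  # [1, 3, 6, 6]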
@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
    r"""

    It computes the pairwise distance between two vectors. The
    distance is calculated by p-order norm:
@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
    Returns:
        Tensor, the dtype is same as input tensor.

        - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
          depending on whether the input has data shaped as :math:`[N, D]`.
        - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
...
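A short, hedged sketch of the p-norm distance described above (illustrative only; it uses the paddle.nn.functional.pairwise_distance API documented in this hunk):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[1.0, 3.0], [3.0, 5.0]])
    y = paddle.to_tensor([[5.0, 6.0], [7.0, 8.0]])

    # p=2 gives the per-row Euclidean distance; keepdim=False -> shape [N].
    dist = F.pairwise_distance(x, y, p=2.0)
    print(dist)            # approximately [5.0, 5.0]

    # keepdim=True keeps the reduced axis -> shape [N, 1].
    dist_kd = F.pairwise_distance(x, y, p=2.0, keepdim=True)
    print(dist_kd.shape)   # [2, 1]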
@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None):
    r"""

    Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.

    If `reduction` set to ``'none'``, the loss is:
@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
    Returns:
        Tensor, the L1 Loss of Tensor ``input`` and ``label``.
        If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
        If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].

    Examples:
@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
            print(l1_loss.numpy())
            # [1.4]

    """
    if reduction not in ['sum', 'mean', 'none']:
        raise ValueError(
@@ -2286,6 +2288,7 @@ def cross_entropy(
    name=None,
):
    r"""

    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -2399,21 +2402,13 @@ def cross_entropy(
    Parameters:
        input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .

            Note:
                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
                2. when use_softmax=False, it expects the output of softmax operator.

        label (Tensor):
            1. If soft_label=False, the shape is
            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
            the data type is int32, int64, float32, float64, where each value is [0, C-1].
@@ -2421,48 +2416,27 @@ def cross_entropy(
            2. If soft_label=True, the shape and data type should be same with ``input`` ,
            and the sum of the labels for each sample should be 1.
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
        soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
        axis (int, optional): The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
            number of dimensions of input :attr:`input`.
            Default is ``-1`` .
        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
        name (str, optional): The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
@@ -2478,9 +2452,7 @@ def cross_entropy(
        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .

    Examples:
        .. code-block:: python

            # hard labels
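            # Illustrative continuation, not part of the original diff: a minimal
            # hard-label call (it assumes the public paddle.nn.functional.cross_entropy API).
            import paddle

            N, C = 4, 5                                   # batch size, number of classes
            logits = paddle.rand([N, C])                  # unscaled logits (use_softmax=True by default)
            labels = paddle.randint(0, C, [N], dtype='int64')
            loss = paddle.nn.functional.cross_entropy(logits, labels)   # reduction='mean'
            print(loss)                                   # a single scalar loss value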
@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None):
    """

    The API measures the soft margin loss between input predictions ``input``
    and target labels ``label`` . It can be described as:
@@ -3842,7 +3815,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Parameters:
        input (Tensor): The input predictions tensor with shape: ``[N, *]``,
            N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
            Available dtype is float32, float64.
@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Returns:

        Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].

    Examples:
        .. code-block:: python
@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
            # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
            # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
            # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
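            # Illustrative continuation, not part of the original diff (it assumes the
            # `input` and `label` tensors defined earlier in this example): the default
            # 'mean' reduction collapses the element-wise losses above into one scalar.
            mean_loss = paddle.nn.functional.soft_margin_loss(input, label)
            print(mean_loss.shape)  # [1]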
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
......
@@ -1735,16 +1735,18 @@ def adaptive_avg_pool1d(x, output_size, name=None):
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
    r"""

    Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
    of the output tensor are determined by the parameter output_size.

    For avg adaptive pool2d:

    .. math::

        hstart &= floor(i * H_{in} / H_{out}) \\
        hend &= ceil((i + 1) * H_{in} / H_{out}) \\
        wstart &= floor(j * W_{in} / W_{out}) \\
        wend &= ceil((j + 1) * W_{in} / W_{out}) \\
        Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)}

    Args:
@@ -1753,14 +1755,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two elements, (H, W). H and W can be either an int, or None which means
            the size will be the same as that of the input.
        data_format (str, optional): The data format of the input and output data. An optional string
            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
            the order of: [batch_size, input_channels, input_height, input_width].
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.

    Returns:
        Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor.

    Examples:
        .. code-block:: python
@@ -1788,6 +1791,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
                x = x,
                output_size=[3, 3])
            # out.shape is [2, 3, 3, 3]
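            # Illustrative continuation, not part of the original diff: check one output
            # cell of `out` against the hstart/hend/wstart/wend formula given above.
            import math
            i, j = 0, 0
            H_in, W_in = x.shape[2], x.shape[3]
            H_out, W_out = out.shape[2], out.shape[3]
            hstart, hend = math.floor(i * H_in / H_out), math.ceil((i + 1) * H_in / H_out)
            wstart, wend = math.floor(j * W_in / W_out), math.ceil((j + 1) * W_in / W_out)
            manual = x[:, :, hstart:hend, wstart:wend].mean(axis=[2, 3])
            # `manual` matches out[:, :, i, j] up to floating point error.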
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
...@@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
""" r"""
This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size. of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d: For avg adaptive pool3d:
.. math:: .. math::
dstart &= floor(i * D_{in} / D_{out}) dstart &= floor(i * D_{in} / D_{out}) \\
dend &= ceil((i + 1) * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) \\
hstart &= floor(j * H_{in} / H_{out}) hstart &= floor(j * H_{in} / H_{out}) \\
hend &= ceil((j + 1) * H_{in} / H_{out}) hend &= ceil((j + 1) * H_{in} / H_{out}) \\
wstart &= floor(k * W_{in} / W_{out}) wstart &= floor(k * W_{in} / W_{out}) \\
wend &= ceil((k + 1) * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) \\
Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]}
{(dend - dstart) * (hend - hstart) * (wend - wstart)} {(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args: Args:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
The data type can be float32, float64. The data type can be float32, float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or
it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means list, it must contain three elements, (D, H, W). D, H and W can be either a int,
the size will be the same as that of the input. or None which means the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string data_format (str, optional): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width]. the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
to :ref:`api_guide_Name`. Usually name is no need to set and Usually name is no need to set and None by default.
None by default.
Returns: Returns:
Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
x = input_data, x = input_data,
output_size=[3, 3, 3]) output_size=[3, 3, 3])
# out.shape is [2, 3, 3, 3, 3] # out.shape is [2, 3, 3, 3, 3]
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
......
@@ -1450,15 +1450,16 @@ class Maxout(Layer):
class Softmax2D(Layer):
    r"""

    Softmax2D Activation.
    Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
    The sum of result in each location (C, H_i, W_j) will be one.

    Shape:
        - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
        - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)

    Returns:
        A Tensor of the same shape and dtype as input with value in range [0, 1].

    Examples:
@@ -1483,6 +1484,7 @@ class Softmax2D(Layer):
            # [[0.42368975 0.51082766 0.47752273 0.5258871 ]
            # [0.66754097 0.47182566 0.5187628 0.5402329 ]
            # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
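            # Illustrative continuation, not part of the original diff (it assumes `out`
            # holds the result printed above): softmax is applied across the channel
            # axis, so the values at every (h, w) location sum to one.
            channel_sum = out.sum(axis=1)
            print(paddle.allclose(channel_sum, paddle.ones_like(channel_sum)))  # True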
""" """
def __init__(self, name=None): def __init__(self, name=None):
......
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class PairwiseDistance(Layer): class PairwiseDistance(Layer):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-oreder norm: distance is calculated by p-oreder norm:
...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer): ...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer):
Generally, no setting is required. Default: None. Generally, no setting is required. Default: None.
Shape: Shape:
x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
is the dimension of the data. Available data type is float32, float64. is the dimension of the data. Available data type is float32, float64.
y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
output: The same dtype as input tensor. - output: The same dtype as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
......
@@ -31,7 +31,8 @@ __all__ = []
class BCEWithLogitsLoss(Layer):
    r"""

    This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
    layer and some reduce operations.
@@ -86,21 +87,21 @@ class BCEWithLogitsLoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.

    Shapes:
        - logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`],
          N is batch_size, `*` means number of additional dimensions. The ``logit``
          is usually the output of Linear layer. Available dtype is float32, float64.
        - label (Tensor): The target labels tensor. 2-D tensor with the same shape as
          ``logit``. The target labels which values should be numbers between 0 and 1.
          Available dtype is float32, float64.
        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``logit`` , else the shape of output is scalar.

    Returns:
        A callable object of BCEWithLogitsLoss.

    Examples:
        .. code-block:: python

            import paddle

            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
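            # Illustrative continuation, not part of the original diff: the layer applies
            # sigmoid to `logit` internally and then the binary cross entropy loss.
            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()   # reduction='mean' by default
            output = bce_logit_loss(logit, label)
            print(output)                                    # a scalar loss tensor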
@@ -139,6 +140,7 @@ class BCEWithLogitsLoss(Layer):
class CrossEntropyLoss(Layer):
    r"""

    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -251,60 +253,35 @@ class CrossEntropyLoss(Layer):
    Parameters:
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
        soft_label (bool, optional): Indicate whether label is soft.
            If soft_label=False, the label is hard. If soft_label=True, the label is soft.
            Default is ``False``.
        axis (int, optional): The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
            of dimensions of input :attr:`input`.
            Default is ``-1`` .
        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
        name (str, optional): The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .

    Shape:
        - **input** (Tensor), the data type is float32, float64. Shape is
          :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .

          Note:
              1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
@@ -312,7 +289,6 @@ class CrossEntropyLoss(Layer):
              2. when use_softmax=False, it expects the output of softmax operator.

        - **label** (Tensor)
          1. If soft_label=False, the shape is
@@ -322,14 +298,9 @@ class CrossEntropyLoss(Layer):
          2. If soft_label=True, the shape and data type should be same with ``input`` ,
          and the sum of the labels for each sample should be 1.
        - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
          The data type is the same as input.
          If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
          If :attr:`reduction` is ``'none'``:
          1. If soft_label = False, the dimension of return value is the same with ``label`` .
@@ -634,6 +605,7 @@ class MSELoss(Layer):
class L1Loss(Layer):
    r"""

    Construct a callable object of the ``L1Loss`` class.
    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
@@ -663,10 +635,10 @@ class L1Loss(Layer):
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - input (Tensor): The input tensor. The shape is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
        - label (Tensor): label. The shape is ``[N, *]``, same shape as ``input`` . Its data type should be float32, float64, int32, int64.
        - output (Tensor): The L1 Loss of ``input`` and ``label``.
          If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
          If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].

    Examples:
@@ -692,6 +664,7 @@ class L1Loss(Layer):
            print(output)
            # [[0.20000005 0.19999999]
            # [0.2 0.79999995]]

    """

    def __init__(self, reduction='mean', name=None):
@@ -712,6 +685,7 @@ class L1Loss(Layer):
class BCELoss(Layer):
    """

    This interface is used to construct a callable object of the ``BCELoss`` class.
    The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
    and target labels ``label`` . The binary_cross_entropy loss can be described as:
@@ -755,13 +729,13 @@ class BCELoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
          number of additional dimensions. The input ``input`` should always
          be the output of sigmoid. Available dtype is float32, float64.
        - label (Tensor): 2-D tensor with the same shape as ``input``. The target
          labels which values should be numbers between 0 and 1. Available
          dtype is float32, float64.
        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``input`` , else the shape of output is scalar.

    Returns:
@@ -914,6 +888,7 @@ class NLLLoss(Layer):
class KLDivLoss(Layer):
    r"""

    Generate a callable object of 'KLDivLoss' to calculate the
    Kullback-Leibler divergence loss between Input(X) and
    Input(Target). Note that Input(X) is the log-probability
@@ -933,14 +908,10 @@ class KLDivLoss(Layer):
            Default is ``'mean'``.

    Shape:
        - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
        - label (Tensor): ``(N, *)``, same shape as input.
        - output (Tensor): tensor with shape: [1] by default.

    Examples:
        .. code-block:: python
@@ -970,6 +941,7 @@ class KLDivLoss(Layer):
            kldiv_criterion = nn.KLDivLoss(reduction='none')
            pred_loss = kldiv_criterion(x, target)
            # shape=[5, 20]
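            # Illustrative continuation, not part of the original diff (it reuses the
            # `paddle` / `nn` imports from this example): since Input(X) should hold
            # log-probabilities, a typical call pairs log_softmax with a softmax target.
            import paddle.nn.functional as F
            log_prob = F.log_softmax(paddle.rand([5, 20]), axis=-1)
            prob_target = F.softmax(paddle.rand([5, 20]), axis=-1)
            pred_loss = nn.KLDivLoss(reduction='mean')(log_prob, prob_target)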
""" """
def __init__(self, reduction='mean'): def __init__(self, reduction='mean'):
...@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer): ...@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer):
class SoftMarginLoss(Layer): class SoftMarginLoss(Layer):
r""" r"""
Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -1738,16 +1711,13 @@ class SoftMarginLoss(Layer): ...@@ -1738,16 +1711,13 @@ class SoftMarginLoss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shapes: Shapes:
- Input (Tensor): The input tensor with shape: ``[N, *]``,
Input (Tensor): The input tensor with shape: [N, *],
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
Available dtype is float32, float64. Available dtype is float32, float64.
- Label (Tensor): The target labels tensor with the same shape as
Label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1. ``input``. The target labels which values should be numbers -1 or 1.
Available dtype is int32, int64, float32, float64. Available dtype is int32, int64, float32, float64.
- Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1]. same as ``input`` , else the shape of output is [1].
Returns: Returns:
...@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer): ...@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer):
# [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511],
# [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399],
# [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]])
""" """
def __init__(self, reduction='mean', name=None): def __init__(self, reduction='mean', name=None):
...
@@ -321,6 +321,7 @@ Where `H` means height of feature map, `W` means width of feature map.
class GroupNorm(Layer):
    """

    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
@@ -341,7 +342,7 @@ class GroupNorm(Layer):
        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - x: Tensor with shape: :attr:`(batch, num_features, *)`.
        - output: The same shape as input x.

    Returns:
@@ -1047,6 +1048,7 @@ class BatchNorm3D(_BatchNormBase):
class SyncBatchNorm(_BatchNormBase):
    r"""

    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully connected
@@ -1092,9 +1094,9 @@ class SyncBatchNorm(_BatchNormBase):
    - :math:`\beta` : trainable shift parameter vector

    Note:
        If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the
        evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of
        :ref:`api_paddle_hub_list` to pack the model.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -1112,8 +1114,8 @@ class SyncBatchNorm(_BatchNormBase):
            have trainable bias parameter. Default: None.

    Shapes:
        - input: Tensor that the dimension from 2 to 5.
        - output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python
@@ -1135,6 +1137,7 @@ class SyncBatchNorm(_BatchNormBase):
            # [[ 0.80956620, -0.66528702],
            # [-1.27446556, 1.13018656]]]])

    """

    def __init__(
@@ -1284,8 +1287,8 @@ class SyncBatchNorm(_BatchNormBase):
            The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.

        Examples:
            .. code-block:: python

                import paddle
                import paddle.nn as nn
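                # Illustrative continuation, not part of the original diff: convert the
                # BatchNorm layers of an existing model into SyncBatchNorm layers before
                # wrapping the model for multi-GPU training.
                model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5))
                sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
                print(sync_model)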
...
@@ -224,6 +224,7 @@ class AvgPool2D(Layer):
class AvgPool3D(Layer):
    """

    This operation applies 3D avg pooling over input features based on the input,
    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
    in NCDHW format, where N is batch size, C is the number of channels,
@@ -264,6 +265,7 @@ class AvgPool3D(Layer):
          The data type can be float32, float64.
        - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor.
          The data type is same as input x.

    Examples:
        .. code-block:: python
...
@@ -514,14 +514,17 @@ class QuantizedConv2D(Layer):
class QuantizedConv2DTranspose(Layer):
    """

    The computational logic of QuantizedConv2DTranspose is the same as Conv2DTranspose.
    The only difference is that its inputs are all fake quantized.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn as nn
            from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose

            x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
            conv = nn.Conv2DTranspose(4, 6, (3, 3))
            conv_quantized = QuantizedConv2DTranspose(conv)
@@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer):
            y_np = y_var.numpy()
            print(y_np.shape, y_quantized_np.shape)
            # (2, 6, 10, 10), (2, 6, 10, 10)

    """

    def __init__(self,
...
@@ -1661,6 +1661,7 @@ class MultiplicativeDecay(LRScheduler):
class OneCycleLR(LRScheduler):
    r"""

    Sets the learning rate according to the one cycle learning rate scheduler.
    The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
    from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
@@ -1674,22 +1675,25 @@ class OneCycleLR(LRScheduler):
    Also note that you should update learning rate each step.

    Args:
        max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
        total_steps (int): Number of total training steps.
        divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
        end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
        phase_pct (float): The percentage of total steps used to increase the learning rate. Default: 0.3.
        anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
        three_phase (bool, optional): Whether to use three phase.

            If ``True``:

                1. The learning rate will first increase from initial learning rate to maximum learning rate.
                2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
                3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.

            If ``False``:

                1. The learning rate will increase to maximum learning rate.
                2. Then it will directly decrease to minimum learning rate.

        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
@@ -1741,6 +1745,7 @@ class OneCycleLR(LRScheduler):
                },
                fetch_list=loss.name)
            scheduler.step()    # You should update learning rate each step

    """

    def __init__(
...
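A condensed dygraph sketch of the schedule described above (illustrative only, not part of the diff; it assumes the paddle.optimizer.lr.OneCycleLR API and, as the note requires, steps the scheduler once per batch):

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.OneCycleLR(
        max_learning_rate=1.0, total_steps=100, divide_factor=25, verbose=False
    )
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for step in range(100):
        x = paddle.uniform([4, 10])
        loss = paddle.mean(linear(x))
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        scheduler.step()   # update the learning rate after every step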
@@ -124,7 +124,8 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
    if frame_length > x.shape[axis]:
        raise ValueError(
            f'Attribute frame_length should be less equal than sequence length, '
            f'but got ({frame_length}) > ({x.shape[axis]}).'
        )

    op_type = 'frame'
@@ -132,25 +133,33 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
        return _C_ops.frame(x, frame_length, hop_length, axis)

    if _in_legacy_dygraph():
        attrs = (
            'frame_length',
            frame_length,
            'hop_length',
            hop_length,
            'axis',
            axis,
        )
        op = getattr(_legacy_C_ops, op_type)
        out = op(x, *attrs)
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={
                'frame_length': frame_length,
                'hop_length': hop_length,
                'axis': axis,
            },
            outputs={'Out': out},
        )
    return out
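A quick, hedged illustration of what ``frame`` produces (not part of the diff; it assumes the public paddle.signal.frame API):

.. code-block:: python

    import paddle
    from paddle.signal import frame

    x = paddle.arange(16, dtype='float32')            # a 1-D signal of length 16
    frames = frame(x, frame_length=4, hop_length=2)   # slide a length-4 window by 2
    print(frames.shape)                               # [4, 7]: frame_length x num_frames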
@@ -225,22 +234,22 @@ def overlap_add(x, hop_length, axis=-1, name=None):
        out = op(x, *attrs)
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={'hop_length': hop_length, 'axis': axis},
            outputs={'Out': out},
        )
    return out
def stft(
    x,
    n_fft,
    hop_length=None,
    win_length=None,
@@ -249,8 +258,10 @@ def stft(x,
    pad_mode='reflect',
    normalized=False,
    onesided=True,
    name=None,
):
    r"""

    Short-time Fourier transform (STFT).

    The STFT computes the discrete Fourier transforms (DFT) of short overlapping
@@ -263,9 +274,12 @@ def stft(x,
    Where:

    - :math:`t`: The :math:`t`-th input window.

    - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`,
      or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`.

    - :math:`N`: Value of `n_fft`.

    - :math:`H`: Value of `hop_length`.

    Args:
@@ -292,9 +306,9 @@ def stft(x,
            to set this property. For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`
        (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`
        (`onesided` is `False`)

    Examples:
        .. code-block:: python
@@ -311,14 +325,17 @@ def stft(x,
            x = paddle.randn([8, 48000], dtype=paddle.float64) + \
                paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128
            y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372]

    """
check_variable_and_dtype(x, 'x', check_variable_and_dtype(
['float32', 'float64', 'complex64', 'complex128'], x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft'
'stft') )
x_rank = len(x.shape) x_rank = len(x.shape)
assert x_rank in [1, 2], \ assert x_rank in [
f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' 1,
2,
], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}'
if x_rank == 1: # (batch, seq_length) if x_rank == 1: # (batch, seq_length)
x = x.unsqueeze(0) x = x.unsqueeze(0)
...@@ -326,69 +343,77 @@ def stft(x, ...@@ -326,69 +343,77 @@ def stft(x,
if hop_length is None: if hop_length is None:
hop_length = int(n_fft // 4) hop_length = int(n_fft // 4)
assert hop_length > 0, \ assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.'
f'hop_length should be > 0, but got {hop_length}.'
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
if _non_static_mode(): if _non_static_mode():
assert 0 < n_fft <= x.shape[-1], \ assert (
f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' 0 < n_fft <= x.shape[-1]
), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
assert 0 < win_length <= n_fft, \ assert (
f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' 0 < win_length <= n_fft
), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
if window is not None: if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \ assert (
f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' len(window.shape) == 1 and len(window) == win_length
), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.'
else: else:
window = paddle.ones(shape=(win_length, ), dtype=x.dtype) window = paddle.ones(shape=(win_length,), dtype=x.dtype)
if win_length < n_fft: if win_length < n_fft:
pad_left = (n_fft - win_length) // 2 pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left pad_right = n_fft - win_length - pad_left
window = paddle.nn.functional.pad(window, window = paddle.nn.functional.pad(
pad=[pad_left, pad_right], window, pad=[pad_left, pad_right], mode='constant'
mode='constant') )
if center: if center:
assert pad_mode in ['constant', 'reflect'], \ assert pad_mode in [
'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode) 'constant',
'reflect',
], 'pad_mode should be "reflect" or "constant", but got "{}".'.format(
pad_mode
)
pad_length = n_fft // 2 pad_length = n_fft // 2
# FIXME: Input `x` can be a complex tensor but pad does not support complex input. # FIXME: Input `x` can be a complex tensor but pad does not support complex input.
x = paddle.nn.functional.pad(x.unsqueeze(-1), x = paddle.nn.functional.pad(
x.unsqueeze(-1),
pad=[pad_length, pad_length], pad=[pad_length, pad_length],
mode=pad_mode, mode=pad_mode,
data_format="NLC").squeeze(-1) data_format="NLC",
).squeeze(-1)
x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1) x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1)
x_frames = x_frames.transpose( x_frames = x_frames.transpose(
perm=[0, 2, perm=[0, 2, 1]
1]) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft) ) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft)
x_frames = paddle.multiply(x_frames, window) x_frames = paddle.multiply(x_frames, window)
norm = 'ortho' if normalized else 'backward' norm = 'ortho' if normalized else 'backward'
if is_complex(x_frames): if is_complex(x_frames):
assert not onesided, \ assert (
'onesided should be False when input or window is a complex Tensor.' not onesided
), 'onesided should be False when input or window is a complex Tensor.'
if not is_complex(x): if not is_complex(x):
out = fft_r2c(x=x_frames, out = fft_r2c(
x=x_frames,
n=None, n=None,
axis=-1, axis=-1,
norm=norm, norm=norm,
forward=True, forward=True,
onesided=onesided, onesided=onesided,
name=name) name=name,
)
else: else:
out = fft_c2c(x=x_frames, out = fft_c2c(
n=None, x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name
axis=-1, )
norm=norm,
forward=True,
name=name)
out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames) out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames)
...@@ -398,7 +423,8 @@ def stft(x, ...@@ -398,7 +423,8 @@ def stft(x,
return out return out
def istft(x, def istft(
x,
n_fft, n_fft,
hop_length=None, hop_length=None,
win_length=None, win_length=None,
...@@ -408,7 +434,8 @@ def istft(x, ...@@ -408,7 +434,8 @@ def istft(x,
onesided=True, onesided=True,
length=None, length=None,
return_complex=False, return_complex=False,
name=None): name=None,
):
r""" r"""
Inverse short-time Fourier transform (ISTFT). Inverse short-time Fourier transform (ISTFT).
...@@ -484,8 +511,12 @@ def istft(x, ...@@ -484,8 +511,12 @@ def istft(x,
check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft') check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft')
x_rank = len(x.shape) x_rank = len(x.shape)
assert x_rank in [2, 3], \ assert x_rank in [
'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank) 2,
3,
], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(
x_rank
)
if x_rank == 2: # (batch, n_fft, n_frames) if x_rank == 2: # (batch, n_fft, n_frames)
x = x.unsqueeze(0) x = x.unsqueeze(0)
...@@ -497,83 +528,107 @@ def istft(x, ...@@ -497,83 +528,107 @@ def istft(x,
win_length = n_fft win_length = n_fft
# Assure no gaps between frames. # Assure no gaps between frames.
assert 0 < hop_length <= win_length, \ assert (
'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length) 0 < hop_length <= win_length
), 'hop_length should be in (0, win_length({})], but got {}.'.format(
win_length, hop_length
)
assert 0 < win_length <= n_fft, \ assert (
'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length) 0 < win_length <= n_fft
), 'win_length should be in (0, n_fft({})], but got {}.'.format(
n_fft, win_length
)
n_frames = x.shape[-1] n_frames = x.shape[-1]
fft_size = x.shape[-2] fft_size = x.shape[-2]
if _non_static_mode(): if _non_static_mode():
if onesided: if onesided:
assert (fft_size == n_fft // 2 + 1), \ assert (
'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) fft_size == n_fft // 2 + 1
), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(
n_fft // 2 + 1, fft_size
)
else: else:
assert (fft_size == n_fft), \ assert (
'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) fft_size == n_fft
), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(
n_fft, fft_size
)
if window is not None: if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \ assert (
'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) len(window.shape) == 1 and len(window) == win_length
), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(
win_length, window.shape
)
else: else:
window_dtype = paddle.float32 if x.dtype in [ window_dtype = (
paddle.float32, paddle.complex64 paddle.float32
] else paddle.float64 if x.dtype in [paddle.float32, paddle.complex64]
window = paddle.ones(shape=(win_length, ), dtype=window_dtype) else paddle.float64
)
window = paddle.ones(shape=(win_length,), dtype=window_dtype)
if win_length < n_fft: if win_length < n_fft:
pad_left = (n_fft - win_length) // 2 pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left pad_right = n_fft - win_length - pad_left
# FIXME: Input `window` can be a complex tensor but pad does not support complex input. # FIXME: Input `window` can be a complex tensor but pad does not support complex input.
window = paddle.nn.functional.pad(window, window = paddle.nn.functional.pad(
pad=[pad_left, pad_right], window, pad=[pad_left, pad_right], mode='constant'
mode='constant') )
x = x.transpose( x = x.transpose(
perm=[0, 2, perm=[0, 2, 1]
1]) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft) ) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft)
norm = 'ortho' if normalized else 'backward' norm = 'ortho' if normalized else 'backward'
if return_complex: if return_complex:
assert not onesided, \ assert (
'onesided should be False when input(output of istft) or window is a complex Tensor.' not onesided
), 'onesided should be False when input(output of istft) or window is a complex Tensor.'
out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
else: else:
assert not is_complex(window), \ assert not is_complex(
'Data type of window should not be complex when return_complex is False.' window
), 'Data type of window should not be complex when return_complex is False.'
if onesided is False: if onesided is False:
x = x[:, :, :n_fft // 2 + 1] x = x[:, :, : n_fft // 2 + 1]
out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
out = paddle.multiply(out, window).transpose( out = paddle.multiply(out, window).transpose(
perm=[0, 2, 1]) # (batch, n_fft, num_frames) perm=[0, 2, 1]
out = overlap_add(x=out, hop_length=hop_length, ) # (batch, n_fft, num_frames)
axis=-1) # (batch, seq_length) out = overlap_add(
x=out, hop_length=hop_length, axis=-1
) # (batch, seq_length)
window_envelop = overlap_add( window_envelop = overlap_add(
x=paddle.tile( x=paddle.tile(
x=paddle.multiply(window, window).unsqueeze(0), x=paddle.multiply(window, window).unsqueeze(0),
repeat_times=[n_frames, repeat_times=[n_frames, 1],
1]).transpose(perm=[1, 0]), # (n_fft, num_frames) ).transpose(
perm=[1, 0]
), # (n_fft, num_frames)
hop_length=hop_length, hop_length=hop_length,
axis=-1) # (seq_length, ) axis=-1,
) # (seq_length, )
if length is None: if length is None:
if center: if center:
out = out[:, (n_fft // 2):-(n_fft // 2)] out = out[:, (n_fft // 2) : -(n_fft // 2)]
window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)] window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)]
else: else:
if center: if center:
start = n_fft // 2 start = n_fft // 2
else: else:
start = 0 start = 0
out = out[:, start:start + length] out = out[:, start : start + length]
window_envelop = window_envelop[start:start + length] window_envelop = window_envelop[start : start + length]
# Check whether the Nonzero Overlap Add (NOLA) constraint is met. # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
if _non_static_mode() and window_envelop.abs().min().item() < 1e-11: if _non_static_mode() and window_envelop.abs().min().item() < 1e-11:
......
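The two routines in this file form an analysis/synthesis pair. As a quick sanity check of the shapes described in the docstrings, here is a minimal round-trip sketch (not part of the diff; it assumes Paddle 2.x, where both functions are exposed as ``paddle.signal.stft`` and ``paddle.signal.istft``):

.. code-block:: python

    # Hedged example: analyse a real batch of signals and reconstruct it.
    # With onesided=True (the default) the spectrum has n_fft // 2 + 1 bins.
    import paddle

    x = paddle.randn([4, 16000], dtype='float32')              # (batch, seq_length)
    spec = paddle.signal.stft(x, n_fft=512, hop_length=128)    # complex64, roughly [4, 257, num_frames]
    x_rec = paddle.signal.istft(spec, n_fft=512, hop_length=128, length=16000)
    print(spec.shape, x_rec.shape)                             # e.g. [4, 257, 126] and [4, 16000]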
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class ReLU(Layer): class ReLU(Layer):
""" """
Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -44,6 +45,7 @@ class ReLU(Layer): ...@@ -44,6 +45,7 @@ class ReLU(Layer):
relu = paddle.sparse.nn.ReLU() relu = paddle.sparse.nn.ReLU()
out = relu(sparse_x) out = relu(sparse_x)
# [0., 0., 1.] # [0., 0., 1.]
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -59,7 +61,8 @@ class ReLU(Layer): ...@@ -59,7 +61,8 @@ class ReLU(Layer):
class Softmax(Layer): class Softmax(Layer):
""" r"""
Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
Note: Note:
...@@ -126,6 +129,7 @@ class Softmax(Layer): ...@@ -126,6 +129,7 @@ class Softmax(Layer):
class ReLU6(Layer): class ReLU6(Layer):
""" """
Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -149,6 +153,7 @@ class ReLU6(Layer): ...@@ -149,6 +153,7 @@ class ReLU6(Layer):
sparse_x = dense_x.to_sparse_coo(1) sparse_x = dense_x.to_sparse_coo(1)
relu6 = paddle.sparse.nn.ReLU6() relu6 = paddle.sparse.nn.ReLU6()
out = relu6(sparse_x) out = relu6(sparse_x)
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -164,7 +169,8 @@ class ReLU6(Layer): ...@@ -164,7 +169,8 @@ class ReLU6(Layer):
class LeakyReLU(Layer): class LeakyReLU(Layer):
""" r"""
Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -196,6 +202,7 @@ class LeakyReLU(Layer): ...@@ -196,6 +202,7 @@ class LeakyReLU(Layer):
sparse_x = dense_x.to_sparse_coo(1) sparse_x = dense_x.to_sparse_coo(1)
leaky_relu = paddle.sparse.nn.LeakyReLU(0.5) leaky_relu = paddle.sparse.nn.LeakyReLU(0.5)
out = leaky_relu(sparse_x) out = leaky_relu(sparse_x)
""" """
def __init__(self, negative_slope=0.01, name=None): def __init__(self, negative_slope=0.01, name=None):
......
...@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None): ...@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None):
def meshgrid(*args, **kwargs): def meshgrid(*args, **kwargs):
""" """
Takes a list of N tensors as input *args, each of which is a 1-dimensional vector, and creates N-dimensional grids.
Takes a list of N tensors as input :attr:`*args`, each of which is a 1-dimensional vector, and creates N-dimensional grids.
Args: Args:
*args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
......
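For reference, the behaviour described in the ``meshgrid`` docstring above can be seen with a two-tensor call (a hedged sketch, not part of the diff; assumes Paddle 2.x):

.. code-block:: python

    import paddle

    x = paddle.arange(3, dtype='float32')      # shape [3]
    y = paddle.arange(4, dtype='float32')      # shape [4]
    grid_x, grid_y = paddle.meshgrid(x, y)     # two grids, each of shape [3, 4]
    print(grid_x.shape, grid_y.shape)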
...@@ -22,9 +22,17 @@ from .math import multiply ...@@ -22,9 +22,17 @@ from .math import multiply
from .math import sum as paddle_sum from .math import sum as paddle_sum
from ..fluid.framework import _in_legacy_dygraph from ..fluid.framework import _in_legacy_dygraph
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid.data_feeder import (
check_variable_and_dtype,
check_type,
check_dtype,
)
from ..fluid.layer_helper import LayerHelper from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..fluid.framework import (
_non_static_mode,
in_dygraph_mode,
_in_legacy_dygraph,
)
import collections import collections
import string import string
import opt_einsum import opt_einsum
...@@ -52,12 +60,13 @@ def parse_op_labels(labelstr, operand): ...@@ -52,12 +60,13 @@ def parse_op_labels(labelstr, operand):
''' '''
# Sanity checks # Sanity checks
for c in labelstr.replace('.', ''): for c in labelstr.replace('.', ''):
assert c.isalpha(), ( assert (
f"Invalid equation: {c} is not a valid label, which should be letters." c.isalpha()
) ), f"Invalid equation: {c} is not a valid label, which should be letters."
assert labelstr.replace('...', '', 1).find('.') == -1, ( assert (
f"Invalid equation: `.` is found outside of an ellipsis.") labelstr.replace('...', '', 1).find('.') == -1
), f"Invalid equation: `.` is found outside of an ellipsis."
# Check shape. Note, in Paddle a tensor rank is always nonzero # Check shape. Note, in Paddle a tensor rank is always nonzero
ndims = len(operand.shape) ndims = len(operand.shape)
...@@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand): ...@@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand):
full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3)) full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3))
assert len(full_labelstr) == ndims, ( assert (
f"Invalid equation: the label string '{labelstr}' misses dimensions.") len(full_labelstr) == ndims
), f"Invalid equation: the label string '{labelstr}' misses dimensions."
return full_labelstr return full_labelstr
...@@ -90,7 +100,8 @@ def parse_labels(labelstr, operands): ...@@ -90,7 +100,8 @@ def parse_labels(labelstr, operands):
nop_labels = labelstr.split(',') nop_labels = labelstr.split(',')
assert len(nop_labels) == len(operands), ( assert len(nop_labels) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, " f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(nop_labels)} segments in the label equation.") f"but found {len(nop_labels)} segments in the label equation."
)
return list(map(parse_op_labels, nop_labels, operands)) return list(map(parse_op_labels, nop_labels, operands))
...@@ -101,8 +112,9 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): ...@@ -101,8 +112,9 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
''' '''
# Sanity check. # Sanity check.
if n_bcast_dims > 0: if n_bcast_dims > 0:
assert '...' in rhs, ( assert (
f"Invalid equation: missing ellipsis in output labels.") '...' in rhs
), f"Invalid equation: missing ellipsis in output labels."
rhs = rhs.replace('...', '') rhs = rhs.replace('...', '')
rhs_set = set(rhs) rhs_set = set(rhs)
...@@ -114,10 +126,12 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): ...@@ -114,10 +126,12 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
non_input_labels = rhs_set.difference(input_labels) non_input_labels = rhs_set.difference(input_labels)
assert not non_input_labels, ( assert not non_input_labels, (
f"Invalid equation: " f"Invalid equation: "
f"output label {sorted(non_input_labels)} not used by any input.") f"output label {sorted(non_input_labels)} not used by any input."
)
# Verify that output labels are not duplicate # Verify that output labels are not duplicate
assert len(rhs) == len(rhs_set), ( assert len(rhs) == len(
f"Invalid equation: duplicate output labels are found.") rhs_set
), f"Invalid equation: duplicate output labels are found."
def build_view(in_labels, out_labels): def build_view(in_labels, out_labels):
...@@ -159,8 +173,8 @@ def build_view(in_labels, out_labels): ...@@ -159,8 +173,8 @@ def build_view(in_labels, out_labels):
# fill the broadcast dimension indices from right to left. # fill the broadcast dimension indices from right to left.
if s: if s:
for ax, dim in zip( for ax, dim in zip(
range(start, end)[::-1], range(start, end)[::-1], range(s.start(), s.end())[::-1]
range(s.start(), s.end())[::-1]): ):
inv_map[ax] = dim inv_map[ax] = dim
# Now work on non-broadcast dimensions # Now work on non-broadcast dimensions
...@@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): ...@@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims):
g_labels_out = rhs.replace('...', '.' * n_bcast_dims) g_labels_out = rhs.replace('...', '.' * n_bcast_dims)
else: else:
g_labels_out = '.' * n_bcast_dims + ''.join( g_labels_out = '.' * n_bcast_dims + ''.join(
l for l, c in zip(labels, count) if c == 1) l for l, c in zip(labels, count) if c == 1
)
for i in range(len(count))[::-1]: for i in range(len(count))[::-1]:
if labels[i] in g_labels_out: if labels[i] in g_labels_out:
...@@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes): ...@@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes):
assert not non_bcastable, ( assert not non_bcastable, (
f"Invalid operands: label {g_labels[non_bcastable[0]]} " f"Invalid operands: label {g_labels[non_bcastable[0]]} "
f"corresponds to non-broadcastable dimensions.") f"corresponds to non-broadcastable dimensions."
)
g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape]
g_masks = [[s > 1 or s == -1 for s in view_shape] g_masks = [
for view_shape in view_shapes] [s > 1 or s == -1 for s in view_shape] for view_shape in view_shapes
]
return g_shape, g_masks return g_shape, g_masks
...@@ -297,8 +314,9 @@ def diagonalize(labels, operand): ...@@ -297,8 +314,9 @@ def diagonalize(labels, operand):
-------- --------
'ijj...i' would be merged into 'ij...' 'ijj...i' would be merged into 'ij...'
''' '''
assert not has_duplicated_labels(labels), ( assert not has_duplicated_labels(
f'Duplicate labels are not supported.') labels
), f'Duplicate labels are not supported.'
return labels, operand return labels, operand
...@@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
plan.add_step(step) plan.add_step(step)
# Check if conditions hold for turning the operation into a matmul # Check if conditions hold for turning the operation into a matmul
if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( if (
(op1_vshape, op2_vshape)): j1 + j2 > 0
op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) and k > 0
] + [np.prod(op1_vshape[K])] and -1 not in np.concatenate((op1_vshape, op2_vshape))
op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) ):
] + [np.prod(op2_vshape[K])] op1_shape = (
list(op1_vshape[I])
+ [np.prod(op1_vshape[J1])]
+ [np.prod(op1_vshape[K])]
)
op2_shape = (
list(op2_vshape[I])
+ [np.prod(op2_vshape[J2])]
+ [np.prod(op2_vshape[K])]
)
# Merge J dims and K dims by reshaping # Merge J dims and K dims by reshaping
step = reshape, [var1], var1, op1_shape step = reshape, [var1], var1, op1_shape
...@@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
step = squeeze, [var2], var2, [-1, -2] step = squeeze, [var2], var2, [-1, -2]
plan.add_step(step) plan.add_step(step)
elif j1 + j2 == 0 and not -1 in np.concatenate( elif j1 + j2 == 0 and not -1 in np.concatenate(
(op1_vshape[K], op2_vshape[K])): (op1_vshape[K], op2_vshape[K])
):
assert all(op1_vshape[K] == op2_vshape[K]) assert all(op1_vshape[K] == op2_vshape[K])
step = reshape, [ step = (
var1 reshape,
], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])] [var1],
var1,
list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])],
)
plan.add_step(step) plan.add_step(step)
step = reshape, [ step = (
var2 reshape,
], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])] [var2],
var2,
list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])],
)
plan.add_step(step) plan.add_step(step)
step = matmul, [var1, var2], var2, False, True step = matmul, [var1, var2], var2, False, True
plan.add_step(step) plan.add_step(step)
...@@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
g_view[op2] = list(op2_view) g_view[op2] = list(op2_view)
def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, def plan_summation(
n_bcast): plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast
):
''' '''
Plan various kinds of summation Plan various kinds of summation
''' '''
...@@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, ...@@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
I, K, J1, J2 = list(range(n_bcast)), [], [], [] I, K, J1, J2 = list(range(n_bcast)), [], [], []
for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:], for ax, dim1, dim2 in zip(
op2_view[n_bcast:]): range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:]
):
if (dim1 != -1) != (dim2 != -1): if (dim1 != -1) != (dim2 != -1):
if dim1 != -1: if dim1 != -1:
...@@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes): ...@@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes):
class Plan: class Plan:
def __init__(self): def __init__(self):
self.env = {} self.env = {}
self.steps = [] self.steps = []
...@@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): ...@@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast):
# op1 is a one element tensor. # op1 is a one element tensor.
plan_scalar_prod(plan, i - 1, i) plan_scalar_prod(plan, i - 1, i)
else: else:
plan_summation(plan, g_view, i - 1, i, g_supports, g_shape, g_count, plan_summation(
n_bcast) plan, g_view, i - 1, i, g_supports, g_shape, g_count, n_bcast
)
# for ax, dim in enumerate(g_view[nop-1][:nout]): # for ax, dim in enumerate(g_view[nop-1][:nout]):
# assert dim == ax # assert dim == ax
...@@ -678,7 +714,9 @@ def preprocess(equation, *operands): ...@@ -678,7 +714,9 @@ def preprocess(equation, *operands):
""" """
equation = equation.replace(" ", "") equation = equation.replace(" ", "")
nop = len(operands) nop = len(operands)
assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop assert nop > 0, (
"Required at least one operand in Einsum API, but received %s " % nop
)
# Part the equation to left hand side and right hand side # Part the equation to left hand side and right hand side
lhs, *rhs = equation.lower().split('->') lhs, *rhs = equation.lower().split('->')
...@@ -692,22 +730,27 @@ def preprocess(equation, *operands): ...@@ -692,22 +730,27 @@ def preprocess(equation, *operands):
assert len(lhs.split(',')) == len(operands), ( assert len(lhs.split(',')) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, " f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(lhs.split(','))} segments in the label equation.") f"but found {len(lhs.split(','))} segments in the label equation."
)
assert not ('...' in lhs and '...' not in rhs assert not (
'...' in lhs and '...' not in rhs
), f'Invalid equation: missing ellipsis in output labels.' ), f'Invalid equation: missing ellipsis in output labels.'
assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > assert not (
0), f'Duplicate labels are not supported.' len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0
), f'Duplicate labels are not supported.'
assert not has_duplicated_labels( assert not has_duplicated_labels(
rhs), f'Invalid equation: duplicate output labels are found.' rhs
), f'Invalid equation: duplicate output labels are found.'
return lhs, rhs, labels return lhs, rhs, labels
def parse_fake_shape(equation, operands, labels): def parse_fake_shape(equation, operands, labels):
""" """
this shape is just used for operands planning. may differ with the original shape. this shape is just used for operands planning. may differ with the original shape.
for example: for example:
... is replaced by 1 ... is replaced by 1
...@@ -715,14 +758,15 @@ def parse_fake_shape(equation, operands, labels): ...@@ -715,14 +758,15 @@ def parse_fake_shape(equation, operands, labels):
Results Results
------- -------
list of shape list of shape
""" """
shaped = collections.namedtuple('shaped', ['shape']) shaped = collections.namedtuple('shaped', ['shape'])
def fake_shape(label, op): def fake_shape(label, op):
assert len(op.shape) == len( assert len(op.shape) == len(label), (
label "length of shape and length of label must be the same, but received %d != %d"
), "length of shape and length of label must be the same, but received %d != %d" % ( % (len(op.shape), len(label))
len(op.shape), len(label)) )
fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.']
fakes = list(map(abs, fakes)) # make -1 -> 1 fakes = list(map(abs, fakes)) # make -1 -> 1
if '.' in label: if '.' in label:
...@@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels): ...@@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels):
def rhs_inference(lhs): def rhs_inference(lhs):
def is_free(key): def is_free(key):
return cnt.get(key) == 1 and key not in ['.', ','] return cnt.get(key) == 1 and key not in ['.', ',']
...@@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs): ...@@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs):
def get_used_label(counter): def get_used_label(counter):
used = set(counter.elements()) used = set(counter.elements())
for c in string.ascii_lowercase: for c in string.ascii_lowercase:
if c not in used: return c if c not in used:
return c
raise ValueError( raise ValueError(
"You have used all `a` - `z`, there can't find a unused for einsum optimization" "You have used all `a` - `z`, there can't find a unused for einsum optimization"
) )
...@@ -786,14 +830,15 @@ def einsum_v2(equation, *operands): ...@@ -786,14 +830,15 @@ def einsum_v2(equation, *operands):
var_list = list(operands) var_list = list(operands)
for path in cons: for path in cons:
(a, b), _, eq, *__ = path (a, b), _, eq, *__ = path
assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." assert (
a > b
), "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it."
var_s = [var_list.pop(a), var_list.pop(b)] var_s = [var_list.pop(a), var_list.pop(b)]
eq = eq.replace(broadcast_label, "...") eq = eq.replace(broadcast_label, "...")
var_list.append(gen_einsum_op(eq, *var_s)) var_list.append(gen_einsum_op(eq, *var_s))
assert len( assert (
var_list len(var_list) == 1
) == 1, "There must be one elements in list, but received %d." % len( ), "There must be one elements in list, but received %d." % len(var_list)
var_list)
return var_list[0] return var_list[0]
...@@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands): ...@@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands):
if _in_legacy_dygraph(): if _in_legacy_dygraph():
# dygraph # dygraph
return _legacy_C_ops.einsum(operands, len(operands), len(operands), return _legacy_C_ops.einsum(
'equation', equation)[0] operands, len(operands), len(operands), 'equation', equation
)[0]
for inp in operands: for inp in operands:
check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
...@@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands): ...@@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands):
helper.create_variable_for_type_inference(dtype=operands[0].dtype) helper.create_variable_for_type_inference(dtype=operands[0].dtype)
for i in range(len(operands)) for i in range(len(operands))
] ]
helper.append_op(type='einsum', helper.append_op(
type='einsum',
inputs={'Operands': operands}, inputs={'Operands': operands},
outputs={ outputs={'Out': out, "InnerCache": caches, "XShape": xshape},
'Out': out, attrs=attrs,
"InnerCache": caches, )
"XShape": xshape
},
attrs=attrs)
return out return out
def einsum(equation, *operands): def einsum(equation, *operands):
r""" r"""
einsum(equation, *operands) einsum(equation, *operands)
The current version of this API should be used in dygraph only mode. The current version of this API should be used in dygraph only mode.
...@@ -873,8 +918,7 @@ def einsum(equation, *operands): ...@@ -873,8 +918,7 @@ def einsum(equation, *operands):
dimensions into broadcasting dimensions. dimensions into broadcasting dimensions.
- Singular labels are called free labels; duplicate labels are dummy labels. Dummy labeled - Singular labels are called free labels; duplicate labels are dummy labels. Dummy labeled
dimensions will be reduced and removed in the output. dimensions will be reduced and removed in the output.
- Output labels can be explicitly specified on the right hand side of `->` or omitted. - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
In the latter case, the output labels will be inferred from the input labels.
- Inference of output labels - Inference of output labels
- Broadcasting label `...`, if present, is put on the leftmost position. - Broadcasting label `...`, if present, is put on the leftmost position.
- Free labels are reordered alphabetically and put after `...`. - Free labels are reordered alphabetically and put after `...`.
...@@ -884,10 +928,11 @@ def einsum(equation, *operands): ...@@ -884,10 +928,11 @@ def einsum(equation, *operands):
the sum over the original output. the sum over the original output.
- Non-input labels are invalid. - Non-input labels are invalid.
- Duplicate labels are invalid. - Duplicate labels are invalid.
- For any dummmy label which is present for the output, it's promoted to - For any dummy label which is present for the output, it's promoted to
a free label. a free label.
- For any free label which is not present for the output, it's lowered to - For any free label which is not present for the output, it's lowered to
a dummy label. a dummy label.
- Examples - Examples
- '...ij, ...jk', where i and k are free labels, j is dummy. The output label - '...ij, ...jk', where i and k are free labels, j is dummy. The output label
string is '...ik' string is '...ik'
...@@ -920,7 +965,7 @@ def einsum(equation, *operands): ...@@ -920,7 +965,7 @@ def einsum(equation, *operands):
operands should equal the number of input terms in the equation. operands should equal the number of input terms in the equation.
Returns: Returns:
result (`Tensor`): the result tensor. result (`Tensor`), the result tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -992,8 +1037,10 @@ def einsum(equation, *operands): ...@@ -992,8 +1037,10 @@ def einsum(equation, *operands):
# [[0.32043904, 0.18164253, 0.27810261], # [[0.32043904, 0.18164253, 0.27810261],
# [0.50226176, 0.24512935, 0.39881429], # [0.50226176, 0.24512935, 0.39881429],
# [0.51476848, 0.23367381, 0.39229113]]]) # [0.51476848, 0.23367381, 0.39229113]]])
""" """
import os import os
if int(os.environ.get('FLAGS_new_einsum', "1")): if int(os.environ.get('FLAGS_new_einsum', "1")):
return einsum_v2(equation, *operands) return einsum_v2(equation, *operands)
...@@ -1039,9 +1086,11 @@ def einsum(equation, *operands): ...@@ -1039,9 +1086,11 @@ def einsum(equation, *operands):
# Counting how many non-trivial dimensions remain for each ax # Counting how many non-trivial dimensions remain for each ax
g_labels, g_view, g_nout, g_count = build_global_view( g_labels, g_view, g_nout, g_count = build_global_view(
nop_labels, rhs, n_bcast_dims) nop_labels, rhs, n_bcast_dims
g_shape, g_supports = build_global_shape(g_view, g_labels, )
[op.shape for op in operands]) g_shape, g_supports = build_global_shape(
g_view, g_labels, [op.shape for op in operands]
)
# Now we're ready to build up an execution plan # Now we're ready to build up an execution plan
args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims
......
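A small sketch of the label-inference rules spelled out in the ``einsum`` docstring above (not part of the diff; assumes Paddle 2.x in dygraph mode):

.. code-block:: python

    import paddle

    x = paddle.rand([2, 3, 4])
    y = paddle.rand([2, 4, 5])
    # 'i' and 'k' are free labels and 'j' is a dummy label, so the inferred
    # output label string is '...ik'; both calls return a [2, 3, 5] tensor.
    implicit = paddle.einsum('...ij,...jk', x, y)
    explicit = paddle.einsum('...ij,...jk->...ik', x, y)
    print(paddle.allclose(implicit, explicit))   # expected True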
...@@ -1912,12 +1912,15 @@ def mv(x, vec, name=None): ...@@ -1912,12 +1912,15 @@ def mv(x, vec, name=None):
def det(x, name=None): def det(x, name=None):
""" """
Calculates the determinant value of a square matrix or batches of square matrices. Calculates the determinant value of a square matrix or batches of square matrices.
Args: Args:
x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the x (Tensor): the input matrix of size `(n, n)` or the
batch of matrices of size `(*, n, n)` where `*` is one or more batch of matrices of size `(*, n, n)` where `*` is one or more
batch dimensions. batch dimensions.
name(str, optional): Name of the output. Default is None. It's used
to print debug info for developers. Details: :ref:`api_guide_Name`
Returns: Returns:
Tensor, the determinant value of a square matrix or batches of square matrices. Tensor, the determinant value of a square matrix or batches of square matrices.
...@@ -1968,18 +1971,20 @@ def det(x, name=None): ...@@ -1968,18 +1971,20 @@ def det(x, name=None):
def slogdet(x, name=None): def slogdet(x, name=None):
""" """
Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches of square matrices' determinant. Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches of square matrices' determinant.
The determinant can be computed with ``sign * exp(logabsdet)``. The determinant can be computed with ``sign * exp(logabsdet)``.
Supports input of float, double Supports input of float, double
Note that for matrices that have zero determinant, this returns ``(0, -inf)`` Note that for matrices that have zero determinant, this returns ``(0, -inf)``
Args: Args:
x (Tensor): the batch of matrices of size :math:`(*, n, n)` x (Tensor): the batch of matrices of size :math:`(*, n, n)`
where :math:`*` is one or more batch dimensions. where :math:`*` is one or more batch dimensions.
Returns: Returns:
y (Tensor): A tensor containing the sign of the determinant and the natural logarithm y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
of the absolute value of determinant, respectively. of the absolute value of determinant, respectively.
Examples: Examples:
...@@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None): ...@@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None):
def matrix_power(x, n, name=None): def matrix_power(x, n, name=None):
r""" r"""
Computes the n-th power of a square matrix or a batch of square matrices. Computes the n-th power of a square matrix or a batch of square matrices.
Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be
...@@ -2122,7 +2128,7 @@ def matrix_power(x, n, name=None): ...@@ -2122,7 +2128,7 @@ def matrix_power(x, n, name=None):
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its
data type should be the same as that of `x`. data type should be the same as that of `x`.
Examples: Examples:
...@@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): ...@@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
def solve(x, y, name=None): def solve(x, y, name=None):
r""" r"""
Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'.
Let :math: `X` be a square matrix or a batch of square matrices, :math:`Y` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be
a vector/matrix or a batch of vectors/matrices, the equation should be: a vector/matrix or a batch of vectors/matrices, the equation should be:
.. math:: .. math::
...@@ -3068,9 +3075,9 @@ def solve(x, y, name=None): ...@@ -3068,9 +3075,9 @@ def solve(x, y, name=None):
Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
Args: Args:
x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
name(str, optional): Name for the operation (optional, default is None). name(str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
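As a cross-check of the relation between ``det`` and ``slogdet`` documented above, a minimal sketch (not part of the diff; it assumes Paddle 2.x, where ``slogdet`` stacks the sign and the log-magnitude into a single tensor):

.. code-block:: python

    import paddle

    x = paddle.rand([3, 3], dtype='float64')
    out = paddle.linalg.slogdet(x)            # out[0]: sign, out[1]: log|det|
    det_recovered = out[0] * paddle.exp(out[1])
    print(paddle.allclose(det_recovered, paddle.linalg.det(x)))   # expected True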
...@@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
""" r"""
stanh activation. stanh activation.
.. math:: .. math::
...@@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): ...@@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
x (Tensor): The input Tensor with data type float32, float64. x (Tensor): The input Tensor with data type float32, float64.
scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_a (float, optional): The scale factor a of the input. Default is 0.67.
scale_b (float, optional): The scale factor b of the output. Default is 1.7159. scale_b (float, optional): The scale factor b of the output. Default is 1.7159.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
A Tensor with the same data type and shape as ``x`` . A Tensor with the same data type and shape as ``x`` .
......
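A quick numerical check of ``stanh`` as defined in the full docstring, namely ``out = scale_b * tanh(scale_a * x)`` (a hedged sketch, not part of the diff; assumes Paddle 2.x):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    out = paddle.stanh(x, scale_a=0.67, scale_b=1.7159)
    ref = 1.7159 * paddle.tanh(0.67 * x)
    print(paddle.allclose(out, ref))   # expected True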
...@@ -1301,6 +1301,7 @@ def distribute_fpn_proposals( ...@@ -1301,6 +1301,7 @@ def distribute_fpn_proposals(
name=None, name=None,
): ):
r""" r"""
In Feature Pyramid Networks (FPN) models, it is necessary to distribute In Feature Pyramid Networks (FPN) models, it is necessary to distribute
all proposals into different FPN levels, with respect to the scale of the proposals, all proposals into different FPN levels, with respect to the scale of the proposals,
the referring scale and the referring level. Besides, to restore the order of the referring scale and the referring level. Besides, to restore the order of
...@@ -1308,8 +1309,9 @@ def distribute_fpn_proposals( ...@@ -1308,8 +1309,9 @@ def distribute_fpn_proposals(
in current proposals. To compute FPN level for each roi, the formula is given as follows: in current proposals. To compute FPN level for each roi, the formula is given as follows:
.. math:: .. math::
roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
where BBoxArea is a function to compute the area of each roi. where BBoxArea is a function to compute the area of each roi.
Args: Args:
...@@ -1333,11 +1335,11 @@ def distribute_fpn_proposals( ...@@ -1333,11 +1335,11 @@ def distribute_fpn_proposals(
None by default. None by default.
Returns: Returns:
multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
and data type is same as `fpn_rois` . The length is max_level-min_level+1. and data type is same as `fpn_rois` . The length is max_level-min_level+1.
restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
, where N is the number of total rois. The data type is int32. , where N is the number of total rois. The data type is int32.
rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
the RoIs' number in each image on the corresponding level. The shape the RoIs' number in each image on the corresponding level. The shape
is [B] and data type of int32, where B is the number of images. is [B] and data type of int32, where B is the number of images.
...@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals( ...@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals(
refer_level=4, refer_level=4,
refer_scale=224, refer_scale=224,
rois_num=rois_num) rois_num=rois_num)
""" """
num_lvl = max_level - min_level + 1 num_lvl = max_level - min_level + 1
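To make the level formula above concrete, here is a worked example (not part of the diff; it assumes the logarithm is base 2, as in the FPN paper, while the docstring writes a plain ``log``, and the level range below is hypothetical):

.. code-block:: python

    import math

    refer_scale, refer_level = 224, 4        # as in the example call above
    min_level, max_level = 2, 5              # hypothetical level range
    w, h = 112.0, 112.0                      # hypothetical RoI width and height
    roi_scale = math.sqrt(w * h)             # 112.0
    level = math.floor(math.log2(roi_scale / refer_scale) + refer_level)
    level = min(max(level, min_level), max_level)
    print(level)                             # 3 -> this RoI is assigned to FPN level 3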
...@@ -2441,6 +2444,7 @@ def matrix_nms( ...@@ -2441,6 +2444,7 @@ def matrix_nms(
name=None, name=None,
): ):
""" """
This operator does matrix non-maximum suppression (NMS). This operator does matrix non-maximum suppression (NMS).
First selects a subset of candidate bounding boxes that have higher scores First selects a subset of candidate bounding boxes that have higher scores
than score_threshold (if provided), then the top k candidate is selected if than score_threshold (if provided), then the top k candidate is selected if
...@@ -2448,6 +2452,7 @@ def matrix_nms( ...@@ -2448,6 +2452,7 @@ def matrix_nms(
decayed according to the Matrix NMS scheme. decayed according to the Matrix NMS scheme.
After the NMS step, at most keep_top_k number of total bboxes are to be kept After the NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1. per image if keep_top_k is larger than -1.
Args: Args:
bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes, predicted locations of M bounding bboxes,
...@@ -2471,29 +2476,32 @@ def matrix_nms( ...@@ -2471,29 +2476,32 @@ def matrix_nms(
on score_threshold. on score_threshold.
keep_top_k (int): Number of total bboxes to be kept per image after NMS keep_top_k (int): Number of total bboxes to be kept per image after NMS
step. -1 means keeping all bboxes after NMS step. step. -1 means keeping all bboxes after NMS step.
use_gaussian (bool): Use Gaussian as the decay function. Default: False use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
background_label (int): The index of background label, the background background_label (int, optional): The index of background label, the background
label will be ignored. If set to -1, then all label will be ignored. If set to -1, then all
categories will be considered. Default: 0 categories will be considered. Default: 0
normalized (bool): Whether detections are normalized. Default: True normalized (bool, optional): Whether detections are normalized. Default: True
return_index(bool): Whether return selected index. Default: False return_index(bool, optional): Whether return selected index. Default: False
return_rois_num(bool): whether return rois_num. Default: True return_rois_num(bool, optional): whether return rois_num. Default: True
name(str): Name of the matrix nms op. Default: None. name(str, optional): Name of the matrix nms op. Default: None.
Returns: Returns:
A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
otherwise, a tuple with two Tensor (Out, RoisNum) is returned. otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
detection results. detection results.
Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
selected indices, which are absolute values across batches. selected indices, which are absolute values across batches.
rois_num (Tensor): A 1-D Tensor with shape [N] containing - rois_num (Tensor), A 1-D Tensor with shape [N] containing
the number of detected boxes in each image. the number of detected boxes in each image.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.vision.ops import matrix_nms from paddle.vision.ops import matrix_nms
boxes = paddle.rand([4, 1, 4]) boxes = paddle.rand([4, 1, 4])
boxes[..., 2] = boxes[..., 0] + boxes[..., 2] boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
boxes[..., 3] = boxes[..., 1] + boxes[..., 3] boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
...@@ -2501,6 +2509,7 @@ def matrix_nms( ...@@ -2501,6 +2509,7 @@ def matrix_nms(
out = matrix_nms(bboxes=boxes, scores=scores, background_label=0, out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
score_threshold=0.5, post_threshold=0.1, score_threshold=0.5, post_threshold=0.1,
nms_top_k=400, keep_top_k=200, normalized=False) nms_top_k=400, keep_top_k=200, normalized=False)
""" """
check_variable_and_dtype( check_variable_and_dtype(
bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms' bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'
......