Unverified · Commit 7fa415ca authored by Wang Xin, committed by GitHub

remove ASCEND* keyword (#53046)

* remove ASCEND* keyword

* update docstring

* bug fixed

* bug fixed
Parent a44d8555
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ascend_trigger_op.h"
namespace paddle {
namespace operators {
class AscendTriggerOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {}

 protected:
  phi::KernelKey GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return phi::KernelKey(framework::proto::VarType::FP32,
                          ctx.device_context().GetPlace());
  }
};

class AscendTriggerOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FeedList", "FeedList of Ascend SubGraph").AsDuplicable();
    AddOutput("FetchList", "FetchList of Ascend SubGraph").AsDuplicable();
    AddAttr<int>("graph_idx", "(int) the graph index").SetDefault(-1);
    AddComment(R"DOC(
Trigger Ascend SubGraph
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ascend_trigger,
ops::AscendTriggerOp,
ops::AscendTriggerOpMaker);
PD_REGISTER_STRUCT_KERNEL(
ascend_trigger, CPU, ALL_LAYOUT, ops::AscendTriggerCPUKernel, float) {}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T, typename DeviceContext>
class AscendTriggerCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Please compile with the WITH_ASCEND option to enable the "
        "ascend_trigger op"));
  }
};
} // namespace operators
} // namespace paddle
......@@ -137,15 +137,6 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
}
};
template <ReduceType red_type, typename T>
class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
public:
......
......@@ -127,15 +127,6 @@ class CReduceOpCPUKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CReduceOpCPUKernel<red_type, T> {};
template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
}
};
template <ReduceType red_type, typename T>
class CReduceOpXPUKernel : public framework::OpKernel<T> {
public:
......
......@@ -49,7 +49,6 @@ register_unity_group(
cos_sim_op.cc
crf_decoding_op.cc
crop_op.cc
ascend_trigger_op.cc
conj_op.cc
imag_op.cc
kldiv_loss_op.cc
......
......@@ -36,13 +36,3 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)
// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
DEFINE_INT_STATUS(STAT_npu2_mem_size)
DEFINE_INT_STATUS(STAT_npu3_mem_size)
DEFINE_INT_STATUS(STAT_npu4_mem_size)
DEFINE_INT_STATUS(STAT_npu5_mem_size)
DEFINE_INT_STATUS(STAT_npu6_mem_size)
DEFINE_INT_STATUS(STAT_npu7_mem_size)
......@@ -45,9 +45,6 @@ inline size_t Alignment(size_t size,
#endif
}
}
if (place.GetType() == phi::AllocationType::NPU) {
size += 32; // required by ascendcl
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
......
......@@ -1153,16 +1153,6 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
"Predictor",
"Choose default funciton type in JitLayer.");
/**
* Custom Device NPU related FLAG
* Name: FLAGS_npu_storage_format
* Since Version: 2.5.0
* Value Range: bool, default=false
* Example:
* Note: Enable NPU Storage Format for Ascend910 performance improvement.
*/
PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, "");
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
* CUDNNv8 related FLAG
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/npu_identity_kernel.h"
#include "glog/logging.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void NPUIdentityKernel(const Context& dev_ctx,
                       const DenseTensor& x,
                       const int format,
                       DenseTensor* out) {
  VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
  out->ShareDataWith(x);
}
} // namespace phi
/** [ Why is the npu_identity op needed? ]
 *
 * 1. Ascend CANN uses an internal storage format for high-performance
 *    computing; for example, if the BatchNorm2D op runs with the CANN
 *    internal storage format ACL_FORMAT_NC1HWC0, the time spent in
 *    transdata is removed and it gains a 2x performance improvement.
 *
 * 2. The internal storage format uses storage_properties_ in DenseTensor
 *    and changes the size and layout of the dense tensor, so when the
 *    tensor is finally converted to numpy the original size and format
 *    should be restored by calling the CANN Identity OP.
 *
 * TODO(qili93): remove this op after custom op and custom device are
 * integrated, and then move this op along with its code to the plugin.
 */
PD_REGISTER_KERNEL(npu_identity,
CPU,
ALL_LAYOUT,
phi::NPUIdentityKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(npu_identity,
GPU,
ALL_LAYOUT,
phi::NPUIdentityKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {}
#endif
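The comment above describes a round trip: move a tensor into CANN's internal layout for speed, then restore the default layout before handing the data back (for example to numpy). Below is a minimal sketch of that round trip, assuming a build in which paddle.incubate._npu_identity is still re-exported (this change removes the re-export) and, for a real effect, an Ascend NPU device; on a plain CPU/GPU build the kernel registered above only shares the input data. The format codes follow the docstring and the removed call sites in this diff (3 = ACL_FORMAT_NC1HWC0, -1 = default).

    import paddle

    x = paddle.ones(shape=[6])
    # Switch to CANN's internal 5-D layout (ACL_FORMAT_NC1HWC0 = 3).
    y = paddle.incubate._npu_identity(x, 3)
    # Restore the default layout (-1) before leaving Paddle, e.g. for numpy().
    z = paddle.incubate._npu_identity(y, -1)
    print(z.numpy())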
......@@ -269,11 +269,6 @@ class ShardingOptimizer(MetaOptimizerBase):
self._gradient_merge_acc_step = gm_acc_step
self._optimizer_sharding = optimizer_sharding
# this feature is designed for ascend and should NOT be used in GPU training
self.pp_allreduce_in_optimize = sharding_configs[
"pp_allreduce_in_optimize"
]
def _inner_opt_minimize(
self, loss, startup_program, parameter_list, no_grad_set
):
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.framework import core
from . import collective
OpRole = core.op_proto_and_checker_maker.OpRole
from paddle.distributed import fleet
class AscendTranspiler(collective.Collective):
    def __init__(self, startup_program, main_program):
        self.nrings = 1
        super().__init__(self.nrings)
        self._startup_program = startup_program
        self._main_program = main_program

    def _insert_allreduce_ops(self):
        block = self._main_program.global_block()
        ring_id = -1
        grad = None
        for idx, op in reversed(list(enumerate(block.ops))):
            if (
                self._is_backward_op(op)
                and self.op_role_var_key in op.attr_names
            ):
                op_role_var = op.all_attrs()[self.op_role_var_key]
                if len(op_role_var) == 0:
                    continue
                assert len(op_role_var) % 2 == 0

                offset = idx
                for i in range(0, len(op_role_var), 2):
                    param = block.vars[op_role_var[i]]
                    grad = block.vars[op_role_var[i + 1]]
                    if param.is_distributed:
                        continue

                    # As we walk the ops in reverse order, insert the
                    # c_allreduce_sum op the same way to keep the ring_id
                    # alternating.
                    ring_id = (ring_id + 1) % self.nrings
                    block._insert_op(
                        offset + 1,
                        type='c_allreduce_sum',
                        inputs={'X': grad},
                        outputs={'Out': grad},
                        attrs={
                            'ring_id': ring_id,
                            self.op_role_key: OpRole.Backward,
                        },
                    )
                    block._insert_op(
                        offset + 2,
                        type='scale',
                        inputs={'X': grad},
                        outputs={'Out': grad},
                        attrs={
                            'scale': 1.0 / fleet.worker_num(),
                            self.op_role_key: OpRole.Backward,
                        },
                    )

        if grad is None:
            return

    def transpile(self):
        self._insert_allreduce_ops()
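For context, a hedged sketch of how this transpiler would be wired up; the Program objects here are placeholders, and the pass only has a visible effect on a main program that already contains backward ops carrying op_role_var:

    import paddle

    paddle.enable_static()
    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()

    # AscendTranspiler is the class defined above; transpile() appends a
    # c_allreduce_sum plus a scale(1 / worker_num) op after each
    # non-distributed parameter gradient.
    AscendTranspiler(startup_program, main_program).transpile()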
......@@ -377,12 +377,6 @@ def monkey_patch_varbase():
return None
new_ivar = self._grad_ivar()
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
return (
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import paddle
from paddle import fluid


class TestAscendTriggerOP(unittest.TestCase):
    """TestCases for the ascend_trigger op"""

    def test_ascend_trigger_op(self):
        paddle.enable_static()
        program = fluid.Program()
        block = program.global_block()
        with fluid.program_guard(program):
            x = paddle.static.data(
                name='x', shape=[1], dtype='int64', lod_level=0
            )
            y = paddle.static.data(
                name='y', shape=[1], dtype='int64', lod_level=0
            )
            block.append_op(
                type="ascend_trigger",
                inputs={"FeedList": [x]},
                outputs={"FetchList": [y]},
                attrs={'graph_idx': 0},
            )

        exe = paddle.static.Executor(paddle.CPUPlace())
        try:
            exe.run(program)
        except RuntimeError:
            pass
        except Exception:
            self.fail("ascend_trigger raised an unexpected exception type")

        paddle.disable_static()


if __name__ == '__main__':
    unittest.main()
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
    "status": "completed",
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        }
    ]
}
EOF
# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
# use ascend
echo "begin test use ascend npu"
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
......@@ -27,7 +27,6 @@ from .tensor import segment_sum
from .tensor import segment_mean
from .tensor import segment_max
from .tensor import segment_min
from .tensor import _npu_identity
from .passes import fuse_resnet_unit_pass
from . import autograd # noqa: F401
......
......@@ -16,6 +16,5 @@ from .math import segment_sum
from .math import segment_mean
from .math import segment_max
from .math import segment_min
from .manipulation import _npu_identity
__all__ = []
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
__all__ = []
# TODO(qili93): remove this op after custom op and custom device
# integrated and then move this op along with its code to plugin.
def _npu_identity(x, format=-1):
    """
    This OP takes in the Tensor :attr:`x` and changes it to an output whose
    storage uses the aclFormat given as an int value. This API is only used
    for Ascend NPU.

    Args:
        x(Tensor): An input N-D Tensor with data type bool, float16,
            float32, float64, int32, int64, int16, int8, uint8.
        format(int): Storage data format of the output in aclFormat,
            default value is -1.

    Returns:
        Tensor: A Tensor with acl storage format on Ascend NPU.

    Examples:
        .. code-block:: python

            # required: npu
            import paddle
            x = paddle.ones(shape=[6])
            y = paddle.incubate._npu_identity(x, 3)  # ACL_FORMAT_NC1HWC0 = 3
            # y.shape = [1, 1, 1, 1, 16]
    """
    if in_dygraph_mode():
        return _C_ops.npu_identity(x, format)
    else:
        check_variable_and_dtype(
            x,
            'x',
            [
                'bool',
                'int8',
                'uint8',
                'int16',
                'int32',
                'int64',
                'float16',
                'float32',
                'float64',
            ],
            'npu_identity',
        )
        helper = LayerHelper('npu_identity', **locals())
        out = helper.create_variable_for_type_inference(
            dtype=x.dtype, stop_gradient=x.stop_gradient
        )
        helper.append_op(
            type='npu_identity',
            inputs={'x': [x]},
            outputs={'out': [out]},
            attrs={'format': format},
        )
        return out
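The dygraph branch is covered by the docstring example above; what follows is a sketch of the static-graph branch, under the assumption that _npu_identity (defined above) is in scope and that the CPU kernel registered earlier in this diff handles the op (it only shares data, so CPUPlace suffices):

    import numpy as np
    import paddle

    paddle.enable_static()
    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[6], dtype='float32')
        # Appends an npu_identity op through LayerHelper; -1 keeps the
        # default storage format.
        y = _npu_identity(x, format=-1)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    (out,) = exe.run(
        main_prog, feed={'x': np.ones([6], dtype='float32')}, fetch_list=[y]
    )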
......@@ -14,7 +14,6 @@
from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
is_compiled_with_cuda,
is_compiled_with_custom_device,
is_compiled_with_rocm,
......@@ -27,7 +26,6 @@ from ...common_ops_import import Variable
from ...device import get_cudnn_version
from ...fluid.data_feeder import check_dtype, check_variable_and_dtype
from ...fluid.layer_helper import LayerHelper
from ...framework import no_grad
from ...tensor.manipulation import squeeze, unsqueeze
from ...utils import (
_contain_var,
......@@ -145,16 +143,6 @@ def _conv_nd(
new_shape = [1] * len(x.shape)
new_shape[channel_dim] = -1
bias = bias.reshape(new_shape)
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
......@@ -739,16 +727,6 @@ def conv2d(
+ bias.shape
+ [1 for i in range(len(x.shape) - channel_dim - 1)],
)
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
......
......@@ -33,7 +33,6 @@ import warnings
import numpy as np
from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode
from paddle.device import get_all_custom_device_type
from paddle.fluid.framework import in_dygraph_mode
from ...fluid import dygraph_utils
......@@ -724,30 +723,6 @@ class _BatchNormBase(Layer):
shape=param_shape,
)
self._variance.stop_gradient = True
# TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
if (
_global_flags()['FLAGS_npu_storage_format']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
weight_trans = _C_ops.npu_identity(
self.weight, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_trans = _C_ops.npu_identity(
self.bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
mean_trans = _C_ops.npu_identity(
self._mean, 3
) # ACL_FORMAT_NC1HWC0 = 3
var_trans = _C_ops.npu_identity(
self._variance, 3
) # ACL_FORMAT_NC1HWC0 = 3
weight_trans._share_underline_tensor_to(self.weight)
bias_trans._share_underline_tensor_to(self.bias)
mean_trans._share_underline_tensor_to(self._mean)
var_trans._share_underline_tensor_to(self._variance)
self._data_format = data_format
self._in_place = False
self._momentum = momentum
......
......@@ -100,10 +100,8 @@ def _proccess_archs(arch):
assert a in [
"GPU",
"ROCM",
"ASCEND",
"ASCEND_CL",
"XPU",
], f"""Supported arhc options are "GPU", "ROCM", "ASCEND" and "ASCEND_CL", "XPU", but the options is {a}"""
], f"""Supported arhc options are "GPU", "ROCM", and "XPU", but the options is {a}"""
archs += "WITH_" + a.upper() + " OR "
arch = "(" + archs[:-4] + ")"
else:
......
......@@ -178,7 +178,6 @@ HIGH_PARALLEL_JOB_NEW = [
'version_test',
'test_broadcast_to_op',
'test_squared_mat_sub_fuse_pass',
'test_fleet_ascend_utils',
'test_fused_emb_seq_pool_op',
'test_imperative_data_loader_exit_func',
'test_feed_fetch_method',
......@@ -460,7 +459,6 @@ HIGH_PARALLEL_JOB_NEW = [
'program_utils_test',
'test_fleet_distributed_strategy',
'test_hybrid_parallel_topology',
'test_ascend_trigger',
'test_fleet_rolemaker_3',
'test_conv_activation_mkldnn_fuse_pass',
'test_fusion_gru_bf16_mkldnn_op',
......@@ -2117,7 +2115,6 @@ CPU_PARALLEL_JOB = [
'test_analyzer_multi_model_prediction',
'test_fleet_base_3',
'test_fleet_base_2',
'test_ascend_trigger',
'test_fleet_amp_meta_optimizer',
'test_fleetrun',
'dense_table_test',
......@@ -3105,7 +3102,6 @@ TWO_PARALLEL_JOB = [
'test_get_inputs_outputs_in_block',
'test_get_device_properties',
'test_fleet_elastic_manager',
'test_fleet_ascend_utils',
'test_executor_check_fetch_list',
'test_eig_op',
'test_egr_performance_benchmark_fluid_cpu',
......