Unverified commit a8e02ef1 authored by F fwenguang and committed by GitHub

[MLU] add mlu ci script (#39805)

* [MLU] add mlu ci script

* Update CMakeLists.txt
Parent f3d54e2e
......@@ -21,7 +21,6 @@ limitations under the License. */
namespace fw = paddle::framework;
namespace plat = paddle::platform;
namespace math = paddle::operators::math;
USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MLU);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/uniform_random_op.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
......@@ -57,14 +58,45 @@ class MLUUniformRandomKernel : public framework::OpKernel<T> {
tensor->mutable_data<T>(ctx.GetPlace());
int64_t size = tensor->numel();
const float min = static_cast<T>(ctx.Attr<float>("min"));
const float max = static_cast<T>(ctx.Attr<float>("max"));
Tensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T *data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::uniform_real_distribution<T> dist(
static_cast<T>(ctx.Attr<float>("min")),
static_cast<T>(ctx.Attr<float>("max")));
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
// make mlu seed
MLUCnnlRandomGeneratorDesc random_desc(/*is_mlu200=*/false, seed);
cnnlDataType_t data_type = ToCnnlDataType(tensor->type());
MLUCnnl::RandomUniform(ctx, size, /*data type=*/data_type,
random_desc.get(), min, max, GetBasePtr(tensor));
auto engine = framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
unsigned int diag_num =
static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
unsigned int diag_step =
static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
if (diag_num > 0) {
PADDLE_ENFORCE_GT(
size, (diag_num - 1) * (diag_step + 1),
platform::errors::InvalidArgument(
"ShapeInvalid: the diagonal's elements is equal (num-1) "
"* (step-1) with num %d, step %d,"
"It should be smaller than %d, but received %d",
diag_num, diag_step, (diag_num - 1) * (diag_step + 1), size));
for (int64_t i = 0; i < diag_num; ++i) {
int64_t pos = i * diag_step + i;
data_cpu[pos] = diag_val;
}
}
// copy to MLU
framework::TensorCopy(
cpu_tensor, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), tensor);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
}
};
......
......@@ -1269,6 +1269,8 @@ function card_test() {
CUDA_DEVICE_COUNT=1
elif [ "${WITH_ROCM}" == "ON" ];then
CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l)
elif [ "${WITH_MLU}" == "ON" ];then
CUDA_DEVICE_COUNT=1
else
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
fi
......@@ -2102,6 +2104,130 @@ set -ex
fi
}
function parallel_test_base_mlu() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/mlu
if [ ${WITH_TESTING:-ON} == "ON" ] ; then
cat <<EOF
========================================
Running unit mlu tests ...
========================================
EOF
set +x
test_cases=$(ctest -N -V) # get all test cases
get_quickly_disable_ut||disable_ut_quickly='disable_ut' # mark cases that are on the quickly-disable list
while read -r line; do
if [[ "$line" == "" ]]; then
continue
fi
read testcase <<< $(echo "$line"|grep -oEi "\w+$")
if [[ "$single_card_tests" == "" ]]; then
single_card_tests="^$testcase$"
else
single_card_tests="$single_card_tests|^$testcase$"
fi
done <<< "$test_cases";
ut_actual_total_startTime_s=`date +%s`
card_test "$single_card_tests" 1 # run cases 1 job each time with single MLU
collect_failed_tests
# add unit test retry for MLU
rm -f $tmp_dir/*
exec_times=0
retry_unittests_record=''
retry_time=4
exec_time_array=('first' 'second' 'third' 'fourth')
parallel_failed_tests_exec_retry_threshold=120
exec_retry_threshold=30
is_retry_execuate=0
rerun_ut_startTime_s=`date +%s`
if [ -n "$failed_test_lists" ];then
if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for timeout UTs which were killed by ctest
fi
need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
while ( [ $exec_times -lt $retry_time ] )
do
if [[ "${exec_times}" == "0" ]] ;then
if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
elif [[ "${exec_times}" == "1" ]] ;then
need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
fi
if [[ "$is_retry_execuate" == "0" ]];then
set +e
retry_unittests_record="$retry_unittests_record$failed_test_lists"
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
set -e
if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
if [[ "${failed_test_lists}" == "" ]];then
break
else
retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
fi
fi
echo "========================================="
echo "This is the ${exec_time_array[$exec_times]} time to re-run"
echo "========================================="
echo "The following unittest will be re-run:"
echo "${retry_unittests}"
for line in ${retry_unittests[@]} ;
do
tmp_one_tmp="$( echo $single_card_tests | grep -oEi $line )"
if [[ "$tmp_one_tmp" != "" ]]; then
if [[ "$one_card_retry" == "" ]]; then
one_card_retry="^$line$"
else
one_card_retry="$one_card_retry|^$line$"
fi
fi
done
if [[ "$one_card_retry" != "" ]]; then
card_test "$one_card_retry" 1 # run cases 1 job each time with single GPU
fi
exec_times=$[$exec_times+1]
failed_test_lists=''
collect_failed_tests
rm -f $tmp_dir/*
one_card_retry=''
else
break
fi
done
fi
rerun_ut_endTime_s=`date +%s`
echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
ut_actual_total_endTime_s=`date +%s`
echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
if [[ "$EXIT_CODE" != "0" ]]; then
show_ut_retry_result
fi
set -ex
fi
}
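For readability, here is a minimal sketch of the retry policy implemented above (variable names are taken from the function itself; the real code additionally distinguishes the first pass from later passes and rebuilds the retry filter from the failure list on each round):
# Simplified sketch, not the actual implementation: failed MLU unit tests are
# re-run up to $retry_time times, but only while the number of failing cases
# stays below the retry threshold; otherwise retrying is skipped and the
# failures are reported directly.
retry_time=4
exec_times=0
while [ "$exec_times" -lt "$retry_time" ] && [ -n "$failed_test_lists" ]; do
    if [ "$need_retry_ut_count" -ge "$exec_retry_threshold" ]; then
        break                        # too many failures, give up on retrying
    fi
    card_test "$one_card_retry" 1    # re-run the failed cases on a single MLU
    failed_test_lists=''
    collect_failed_tests             # refresh $failed_test_lists from the ctest logs
    exec_times=$((exec_times + 1))
done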
function parallel_test() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
......@@ -2117,6 +2243,8 @@ function parallel_test() {
parallel_test_base_xpu
elif [ "$WITH_ASCEND_CL" == "ON" ];then
parallel_test_base_npu
elif [ "$WITH_MLU" == "ON" ];then
parallel_test_base_mlu
else
parallel_test_base_cpu ${PROC_RUN:-1}
fi
......@@ -2873,6 +3001,11 @@ function main() {
parallel_test
check_coverage
;;
check_mlu_coverage)
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
parallel_test
check_coverage
;;
reuse_so_cicheck_py35)
reuse_so_cache
parallel_test
......
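A hedged example of how the new MLU stage could be run locally; the script path and environment variables below are assumptions inferred from the functions above, not part of this diff:
# Assumed local invocation: WITH_MLU=ON makes parallel_test() dispatch to
# parallel_test_base_mlu, and check_mlu_coverage is the new task handled in main().
export WITH_MLU=ON
export WITH_TESTING=ON
bash paddle/scripts/paddle_build.sh check_mlu_coverage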
......@@ -23,6 +23,8 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
class TestAccuracyOp(OpTest):
def setUp(self):
......@@ -132,5 +134,4 @@ class TestAccuracyAPI(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -29,6 +29,7 @@ import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
_set_use_system_allocator(True)
paddle.enable_static()
def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
......@@ -698,5 +699,4 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -26,6 +26,8 @@ import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
paddle.enable_static()
class TestBatchNorm(unittest.TestCase):
def test_name(self):
......@@ -291,5 +293,4 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -25,6 +25,8 @@ import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
class TestCastOpFp32ToFp16(OpTest):
def setUp(self):
......@@ -119,17 +121,7 @@ class TestCastOpError(unittest.TestCase):
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.MLUPlace(0))
self.assertRaises(TypeError, fluid.layers.cast, x1, 'int32')
# The input dtype of cast_op must be bool, float16, float32, float64, int32, int64, uint8.
x2 = fluid.layers.data(name='x2', shape=[4], dtype='int16')
self.assertRaises(TypeError, fluid.layers.cast, x2, 'int32')
def test_dtype_type():
x4 = fluid.layers.data(name='x4', shape=[4], dtype='int32')
output = fluid.layers.cast(x=x4, dtype='int16')
self.assertRaises(TypeError, test_dtype_type)
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -176,7 +176,7 @@ def create_test_AxisTensor(parent):
class TestConcatAxisTensor(parent):
def setUp(self):
self.op_type = "concat"
self.dtype = self.init_dtype()
self.init_dtype()
self.init_test_data()
self.inputs = {
......
......@@ -23,6 +23,8 @@ from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
class TestElementwiseAddOp(OpTest):
def set_mlu(self):
......@@ -523,5 +525,4 @@ class TestBoolAddFloatElementwiseAddop(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -27,6 +27,8 @@ import paddle.fluid as fluid
import numpy as np
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
# Situation 1: Attr(shape) is a list(without tensor)
class TestFillConstantOp1(OpTest):
......@@ -449,5 +451,4 @@ class TestFillConstantOpError(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -26,6 +26,8 @@ sys.path.append('..')
from op_test import OpTest
import paddle
paddle.enable_static()
class TestGaussianRandomOp(OpTest):
def setUp(self):
......@@ -74,5 +76,4 @@ class TestMeanStdAreInt(TestGaussianRandomOp):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -26,6 +26,8 @@ import paddle.fluid as fluid
import numpy
from test_momentum_op import calculate_momentum_by_numpy
paddle.enable_static()
class TestMomentumOp1(OpTest):
def setUp(self):
......@@ -608,5 +610,4 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -27,6 +27,8 @@ sys.path.append('..')
from op_test import OpTest
from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index
paddle.enable_static()
def pool2d_backward_navie(x,
ksize,
......@@ -1016,5 +1018,4 @@ class TestDygraphPool2DAPI(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -25,6 +25,8 @@ import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.static import Program, program_guard
paddle.enable_static()
class TestScaleOp(OpTest):
def setUp(self):
......@@ -201,5 +203,4 @@ class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -22,6 +22,8 @@ from op_test import OpTest
import paddle
import paddle.fluid.core as core
paddle.enable_static()
class TestTopkOp(OpTest):
def setUp(self):
......@@ -69,5 +71,4 @@ class TestTopkFP16Op(TestTopkOp):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
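The MLU unit tests above live under python/paddle/fluid/tests/unittests/mlu and are discovered by ctest, as parallel_test_base_mlu shows; a hypothetical manual run of a single case from the build tree might look like this (the test name is only an example, not taken from this diff):
# Hypothetical manual run of one MLU unit test; the -R pattern must match a
# test target registered in the mlu unittest directory.
cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/mlu
ctest -R "^test_uniform_random_op_mlu$" --output-on-failure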