Unverified commit 8852591f, authored by Allen Guo and committed by GitHub

[IPU] add IPU-related CI configurations (#40354)

* add ci

* rm retry tests

* format

* restore retry tests

* update timeout for ipu uts
Parent 85f8fd9b
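For context, the diff below adds a WITH_IPU switch to the cmake invocations in paddle/scripts/paddle_build.sh and a check_ipu_coverage entry point to main(). A minimal sketch of exercising the new path locally, assuming a machine with the IPU toolchain (Poplar SDK) available; the exact environment setup is an assumption based on how the existing coverage targets are driven:

# Hypothetical local run of the IPU CI path added in this commit (a sketch,
# not the official CI command). WITH_IPU and check_ipu_coverage come from
# this diff; WITH_TESTING already gates unit tests in paddle_build.sh.
export WITH_IPU=ON
export WITH_TESTING=ON
bash paddle/scripts/paddle_build.sh check_ipu_coverage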
@@ -229,6 +229,7 @@ function cmake_base() {
-DWITH_CNCL=${WITH_CNCL:-OFF}
-DWITH_XPU=${WITH_XPU:-OFF}
-DWITH_MLU=${WITH_MLU:-OFF}
-DWITH_IPU=${WITH_IPU:-OFF}
-DLITE_GIT_TAG=release/v2.10
-DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF}
-DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF}
@@ -280,6 +281,7 @@ EOF
-DLITE_GIT_TAG=release/v2.10 \
-DWITH_XPU=${WITH_XPU:-OFF} \
-DWITH_MLU=${WITH_MLU:-OFF} \
-DWITH_IPU=${WITH_IPU:-OFF} \
-DWITH_CNCL=${WITH_CNCL:-OFF} \
-DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \
-DWITH_LITE=${WITH_LITE:-OFF} \
@@ -1283,6 +1285,8 @@ function card_test() {
CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l)
elif [ "${WITH_MLU}" == "ON" ];then
CUDA_DEVICE_COUNT=1
elif [ "${WITH_IPU}" == "ON" ];then
CUDA_DEVICE_COUNT=1
else
CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
fi
@@ -2240,6 +2244,130 @@ set -ex
fi
}
function parallel_test_base_ipu() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu
if [ ${WITH_TESTING:-ON} == "ON" ] ; then
cat <<EOF
========================================
Running IPU unit tests ...
========================================
EOF
set +x
test_cases=$(ctest -N -V) # get all test cases
get_quickly_disable_ut||disable_ut_quickly='disable_ut' # indicates whether the case is on the quick-disable list
while read -r line; do
if [[ "$line" == "" ]]; then
continue
fi
read testcase <<< $(echo "$line"|grep -oEi "\w+$")
if [[ "$single_card_tests" == "" ]]; then
single_card_tests="^$testcase$"
else
single_card_tests="$single_card_tests|^$testcase$"
fi
done <<< "$test_cases";
ut_actual_total_startTime_s=`date +%s`
card_test "$single_card_tests" 1 # run cases 1 job each time with single IPU
collect_failed_tests
# add unit test retry for IPU
rm -f $tmp_dir/*
exec_times=0
retry_unittests_record=''
retry_time=4
exec_time_array=('first' 'second' 'third' 'fourth')
parallel_failed_tests_exec_retry_threshold=120
exec_retry_threshold=30
is_retry_execuate=0
rerun_ut_startTime_s=`date +%s`
if [ -n "$failed_test_lists" ];then
if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for timed-out UTs that were killed by ctest
fi
need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
while ( [ $exec_times -lt $retry_time ] )
do
if [[ "${exec_times}" == "0" ]] ;then
if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
elif [[ "${exec_times}" == "1" ]] ;then
need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
fi
if [[ "$is_retry_execuate" == "0" ]];then
set +e
retry_unittests_record="$retry_unittests_record$failed_test_lists"
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
set -e
if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
if [[ "${failed_test_lists}" == "" ]];then
break
else
retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
fi
fi
echo "========================================="
echo "This is the ${exec_time_array[$exec_times]} time to re-run"
echo "========================================="
echo "The following unittest will be re-run:"
echo "${retry_unittests}"
for line in ${retry_unittests[@]} ;
do
tmp_one_tmp="$( echo $single_card_tests | grep -oEi $line )"
if [[ "$tmp_one_tmp" != "" ]]; then
if [[ "$one_card_retry" == "" ]]; then
one_card_retry="^$line$"
else
one_card_retry="$one_card_retry|^$line$"
fi
fi
done
if [[ "$one_card_retry" != "" ]]; then
card_test "$one_card_retry" 1 # run cases 1 job each time with single GPU
fi
exec_times=$[$exec_times+1]
failed_test_lists=''
collect_failed_tests
rm -f $tmp_dir/*
one_card_retry=''
else
break
fi
done
fi
rerun_ut_endTime_s=`date +%s`
echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
ut_actual_total_endTime_s=`date +%s`
echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
if [[ "$EXIT_CODE" != "0" ]]; then
show_ut_retry_result
fi
set -ex
fi
}
function parallel_test() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
@@ -2257,6 +2385,8 @@ function parallel_test() {
parallel_test_base_npu
elif [ "$WITH_MLU" == "ON" ];then
parallel_test_base_mlu
elif [ "$WITH_IPU" == "ON" ];then
parallel_test_base_ipu
else
parallel_test_base_cpu ${PROC_RUN:-1}
fi
@@ -3022,6 +3152,11 @@ function main() {
parallel_test
check_coverage
;;
check_ipu_coverage)
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
parallel_test
check_coverage
;;
reuse_so_cicheck_py35)
reuse_so_cache
parallel_test
......
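As a usage note on parallel_test_base_ipu above: the filter string it assembles ("^name$" patterns joined with "|") is ordinary ctest regex syntax, so a single IPU UT can also be rerun by hand while debugging. A minimal sketch, assuming the build tree already exists; the directory matches the cd at the top of the function and test_conv_op_ipu is one of the tests listed in the CMake change below:

# Manually rerun one IPU unit test with the same kind of ^name$ filter
# that parallel_test_base_ipu passes to card_test.
cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu
ctest -R "^test_conv_op_ipu$" --output-on-failure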
@@ -4,5 +4,11 @@ if(WITH_IPU)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
# set all UTs timeout to 200s
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)
endforeach(TEST_OP)
set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300)
set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300)
set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600)
endif()
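For reference, a quick way to check that the per-test TIMEOUT overrides above took effect, assuming CTest 3.14+ (whose --show-only=json-v1 output includes each test's properties); this is a sketch for local inspection, not part of the CI flow:

# List the IPU tests' TIMEOUT properties from the build tree (sketch).
cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu
ctest --show-only=json-v1 | grep -A 2 '"TIMEOUT"'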
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
import paddle.nn.functional as F
import paddle.optimizer
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
np_dtype_to_fluid_str)
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU")
class TestRelu(IPUOpTest):
def setUp(self):
self.set_atol()
self.set_training()
self.init_op()
def init_op(self):
self.op = paddle.fluid.layers.relu
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed.values()]
self.feed_list = list(self.feed.keys())
self.feed_dtype = [
np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
]
def _test_base(self, run_ipu=True):
scope = fluid.core.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
SEED = self.SEED
main_prog.random_seed = SEED
startup_prog.random_seed = SEED
with fluid.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype=self.feed_dtype[0])
out = self.op(x, **self.attrs)
fetch_list = [out.name]
if run_ipu:
place = paddle.IPUPlace()
else:
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
if run_ipu:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.SetGraphConfig(is_training=self.is_training)
program = compiler.IpuCompiler(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
program = main_prog
result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
return result[0]
def run_test_base(self):
res0 = self._test_base(False)
res1 = self._test_base(True)
self.assertTrue(
np.allclose(
res0.flatten(), res1.flatten(), atol=self.atol))
self.assertTrue(res0.shape == res1.shape)
def test_case0(self):
self.feed = {
"x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'),
}
self.attrs = {}
self.set_feed_attr()
self.run_test_base()
class TestTanh(TestRelu):
def init_op(self):
self.op = F.tanh
class TestLog(TestRelu):
def init_op(self):
self.op = paddle.fluid.layers.log
class TestSigmoid(TestRelu):
def init_op(self):
self.op = F.sigmoid
class TestSqrt(TestRelu):
def init_op(self):
self.op = paddle.fluid.layers.sqrt
if __name__ == "__main__":
unittest.main()
@@ -115,7 +115,7 @@ class TestBase(IPUOpTest):
class TestCase1(TestBase):
def set_atol(self):
self.atol = 1e-7
self.atol = 1e-6
self.rtol = 1e-6
self.atol_fp16 = 1e-3
self.rtol_fp16 = 1e-3
@@ -129,7 +129,7 @@ class TestCase1(TestBase):
class TestCase2(TestBase):
def set_atol(self):
self.atol = 1e-7
self.atol = 1e-6
self.rtol = 1e-6
self.atol_fp16 = 1e-3
self.rtol_fp16 = 1e-3
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
import paddle.optimizer
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
np_dtype_to_fluid_str)
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_ipu(),
"core is not compiled with IPU")
class TestBase(IPUOpTest):
def setUp(self):
self.set_atol()
self.set_feed()
self.set_feed_attr()
self.set_attrs()
def set_feed(self):
np_data = np.random.uniform(low=-1, high=1, size=[1, 3, 100, 100])
self.feed_ipu = {"x": np_data.astype('float16')}
self.feed_cpu = {"x": np_data.astype('float32')}
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed_cpu.values()]
self.feed_list = list(self.feed_cpu.keys())
self.feed_dtype = [
np_dtype_to_fluid_str(x.dtype) for x in self.feed_cpu.values()
]
def set_attrs(self):
self.attrs = {}
def _test_base(self, run_ipu=True):
scope = fluid.core.Scope()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
SEED = self.SEED
main_prog.random_seed = SEED
startup_prog.random_seed = SEED
with fluid.scope_guard(scope):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(
name=self.feed_list[0],
shape=self.feed_shape[0],
dtype=self.feed_dtype[0])
conv1 = paddle.static.nn.conv2d(
x, num_filters=3, filter_size=3, bias_attr=False)
conv2 = paddle.static.nn.conv2d(
x, num_filters=3, filter_size=3, bias_attr=False)
add1 = conv1 + conv2
conv3 = paddle.static.nn.conv2d(
add1, num_filters=8, filter_size=8, bias_attr=False)
out = paddle.fluid.layers.relu(conv3, **self.attrs)
fetch_list = [out.name]
if run_ipu:
place = paddle.IPUPlace()
else:
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
feed = self.feed_ipu if run_ipu else self.feed_cpu
if run_ipu:
feed_list = self.feed_list
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.SetGraphConfig(is_training=False)
ipu_strategy.SetHalfConfig(enable_fp16=True)
program = compiler.IPUCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
else:
feed_list = self.feed_list
program = main_prog
result = exe.run(program, feed=feed, fetch_list=fetch_list)
return result[0]
def test_base(self):
res0 = self._test_base(False)
res1 = self._test_base(True)
self.assertTrue(res0.shape == res1.shape)
mae = np.mean(np.abs(res0.flatten() - res1.flatten()))
print("mae is ", mae)
self.assertTrue(mae < 0.001)
if __name__ == "__main__":
unittest.main()