Commit 76af1f81 authored by liuqi

Fix tuning bug: add profiling=true in validate_gcn.sh.

Parent a038d23f
@@ -55,11 +55,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1]},
{local_ws[1], local_ws[0]},
{kwg_size / 4, 4},
{kwg_size / 16, 16},
{kwg_size / 32, 32},
{kwg_size / 64, 64},
{kwg_size / 128, 128},
{kwg_size / 256, 256},
{kwg_size / 512, 512},
{kwg_size, 1},
{1, kwg_size}
};
......
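Across every kernel touched by this commit the candidate list follows the same recipe: the clamped `local_ws` computed from the tensor shape comes first, followed by fixed power-of-two splits of the kernel's maximum work-group size. A minimal standalone sketch of the 2D variant used by AddN (hedged: `BuildCandidates2D` is a hypothetical helper name, and the `local_ws[0]` computation is assumed from surrounding code this hunk does not show):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the AddN candidate list above: the clamped
// local_ws pair first, then power-of-two splits of the max work-group size.
// Candidates with a zero dimension (very small kwg_size) are assumed to be
// rejected by the tuner before launch.
std::vector<std::vector<uint32_t>> BuildCandidates2D(
    uint32_t width_pixels, uint32_t batch_height_pixels, uint32_t kwg_size) {
  std::vector<uint32_t> local_ws(2, 1);
  local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
  local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
  return {{local_ws[0], local_ws[1]},
          {local_ws[1], local_ws[0]},
          {kwg_size / 4, 4},
          {kwg_size / 16, 16},
          {kwg_size / 32, 32},
          {kwg_size / 64, 64},
          {kwg_size / 128, 128},
          {kwg_size / 256, 256},
          {kwg_size / 512, 512},
          {kwg_size, 1},
          {1, kwg_size}};
}
```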
@@ -65,9 +65,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{8, 128, 1}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -83,7 +81,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {8, 128, 1}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
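All of the 3D kernels below apply the same reorder: the hand-picked SNPE work-group size moves from the head of the candidate list to the tail, so the shape-derived `local_ws` is tried first. Each candidate feeds a timed launch lambda like the `func` shown above; a simplified, self-contained sketch of that hook (assumptions: the OpenCL C++ wrapper from CL/cl.hpp, and `RunWithParams` is a hypothetical name for what is an inline lambda in the real code):

```cpp
#include <CL/cl.hpp>
#include <cstdint>
#include <vector>

// Launch the kernel with one candidate local work-group size, wait for
// completion so the tuner's timer brackets the whole execution, and
// surface the OpenCL error code so invalid candidates can be skipped.
cl_int RunWithParams(cl::CommandQueue &queue, cl::Kernel &kernel,
                     const std::vector<uint32_t> &gws,
                     const std::vector<uint32_t> &params) {
  cl::Event event;
  cl_int error = queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(params[0], params[1], params[2]),
      nullptr, &event);
  if (error == CL_SUCCESS) {
    event.wait();
  }
  return error;
}
```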
@@ -55,8 +55,7 @@ static void Concat2(const Tensor *input0,
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -73,7 +72,9 @@ static void Concat2(const Tensor *input0,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -72,9 +72,7 @@ void Conv1x1(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size/16, 4, 4},
{kwg_size/32, 4, 8},
{kwg_size/32, 8, 4},
@@ -90,7 +88,9 @@ void Conv1x1(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
......
@@ -66,8 +66,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -84,7 +83,9 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -68,8 +68,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -86,7 +85,9 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -67,9 +67,7 @@ static void Pooling(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -85,7 +83,9 @@ static void Pooling(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -55,9 +55,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -73,7 +71,9 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -64,9 +64,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -78,7 +76,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -69,9 +69,12 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(2, 1, 256, 256, 32, float);
BM_ADDN(2, 1, 128, 128, 32, float);
// BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(4, 1, 240, 240, 256, float);
BM_ADDN(4, 1, 128, 128, 3, float);
BM_ADDN(2, 1, 256, 256, 3, float);
BM_ADDN(2, 1, 512, 512, 3, float);
// BM_ADDN(4, 1, 240, 240, 256, half);
} // namespace mace
@@ -46,6 +46,8 @@ class Tuner {
// tune
std::vector<param_type> opt_param = default_param;
RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
+      VLOG(1) << "Tuning result. "
+              << param_key << ": " << internal::MakeString(opt_param);
param_table_[param_key] = opt_param;
return res;
} else {
......
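For context, the `TuneOrRun` entry point that gains the VLOG line roughly behaves as follows. This is a hedged reconstruction, not the actual implementation: only the tune branch is visible in the hunk, the run branch (table lookup with fallback to the default parameters) is an assumption, and the timing here uses std::chrono where the real code takes a `Timer *`:

```cpp
#include <chrono>
#include <functional>
#include <limits>
#include <map>
#include <string>
#include <vector>

// Sketch: when tuning, time every candidate and record the fastest under
// param_key; otherwise run with the previously tuned (or default) params.
template <typename param_type, typename RetType>
RetType TuneOrRunSketch(
    const std::string &param_key,
    const std::vector<param_type> &default_param,
    const std::function<std::vector<std::vector<param_type>>()> &param_generator,
    const std::function<RetType(const std::vector<param_type> &)> &func,
    std::map<std::string, std::vector<param_type>> *param_table,
    bool is_tuning) {
  if (is_tuning && param_generator != nullptr) {
    std::vector<param_type> opt_param = default_param;
    double best_time = std::numeric_limits<double>::max();
    RetType best_res{};
    for (const auto &candidate : param_generator()) {
      auto start = std::chrono::steady_clock::now();
      RetType res = func(candidate);
      double elapsed = std::chrono::duration<double>(
                           std::chrono::steady_clock::now() - start).count();
      if (elapsed < best_time) {  // keep only the fastest candidate
        best_time = elapsed;
        opt_param = candidate;
        best_res = res;
      }
    }
    (*param_table)[param_key] = opt_param;  // what the new VLOG reports
    return best_res;
  }
  // Run mode: use tuned parameters when present, else the default.
  auto iter = param_table->find(param_key);
  return func(iter != param_table->end() ? iter->second : default_param);
}
```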
@@ -9,10 +9,10 @@
namespace mace {
-class TunerTest: public ::testing::Test {
+class TunerTest : public ::testing::Test {
protected:
virtual void SetUp() {
remove( "/data/local/tmp/mace.config" );
remove("/data/local/tmp/mace.config");
setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1);
setenv("MACE_TUNING", "1", 1);
}
@@ -20,7 +20,7 @@ class TunerTest: public ::testing::Test {
TEST_F(TunerTest, SimpleRun) {
int expect = 1;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
if (params.front() == 1) {
return expect;
} else {
@@ -29,19 +29,27 @@ TEST_F(TunerTest, SimpleRun) {
};
WallClockTimer timer;
-  std::vector<int> default_params(1, 1);
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  std::vector<unsigned int> default_params(1, 1);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                         default_params,
+                                                                         nullptr,
+                                                                         TunerFunc,
+                                                                         &timer);
EXPECT_EQ(expect, res);
default_params[0] = 2;
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
-  EXPECT_EQ(expect+1, res);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
+  EXPECT_EQ(expect + 1, res);
}
TEST_F(TunerTest, SimpleTune) {
int expect = 3;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
if (params.front() == expect) {
return expect;
} else {
@@ -50,17 +58,26 @@ TEST_F(TunerTest, SimpleTune) {
}
};
-  std::vector<int> default_params(1, 1);
-  auto params_generator = []()->std::vector<std::vector<int>> {
+  std::vector<unsigned int> default_params(1, 1);
+  auto params_generator = []() -> std::vector<std::vector<unsigned int>> {
return {{1}, {2}, {3}, {4}};
};
// tune
WallClockTimer timer;
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc, &timer);
+  int res =
+      Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                   default_params,
+                                                                   *params_generator,
+                                                                   TunerFunc,
+                                                                   &timer);
EXPECT_EQ(expect, res);
// run
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
EXPECT_EQ(expect, res);
}
......
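The fixture above wires the tuner to its two environment switches before each test. A hedged sketch of how those variables are plausibly consumed (the accessor names `IsTuning` and `ParameterFilePath` are assumptions, not confirmed API):

```cpp
#include <cstdlib>
#include <string>

// MACE_TUNING=1 flips TuneOrRun from lookup mode into tuning mode.
bool IsTuning() {
  const char *tuning = std::getenv("MACE_TUNING");
  return tuning != nullptr && std::string(tuning) == "1";
}

// MACE_RUN_PARAMETER_PATH names the file where tuned parameters persist
// between runs; the tests point it at /data/local/tmp/mace.config and
// delete that file in SetUp() for a clean slate.
std::string ParameterFilePath() {
  const char *path = std::getenv("MACE_RUN_PARAMETER_PATH");
  return path != nullptr ? std::string(path) : "";
}
```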
@@ -39,6 +39,15 @@ build_and_run()
PRODUCTION_MODE_BUILD_FLAGS="--define production=true"
fi
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
TUNING_MODE_BUILD_FLAGS="--define profiling=true"
tuning_flag=1
round=0 # only warm up
else
tuning_flag=0
round=100
fi
bazel build --verbose_failures -c opt --strip always mace/examples:mace_run \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -47,7 +56,8 @@ build_and_run()
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
-    $PRODUCTION_MODE_BUILD_FLAGS || exit -1
+    $PRODUCTION_MODE_BUILD_FLAGS \
+    $TUNING_MODE_BUILD_FLAGS || exit -1
adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit -1
if [ "$PRODUCTION_MODE" = false ]; then
@@ -56,14 +66,6 @@ build_and_run()
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit -1
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} || exit -1
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
tuning_flag=1
round=0 # only warm up
else
tuning_flag=0
round=2
fi
adb </dev/null shell MACE_TUNING=${tuning_flag} \
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
......
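This relocation is the heart of the fix: in the old script the tuning check ran only after `bazel build`, so `TUNING_MODE_BUILD_FLAGS` was never set at build time and `--define profiling=true` was never passed, leaving the profiling support that tuning depends on out of the binary. Moving the block ahead of the build and threading `$TUNING_MODE_BUILD_FLAGS` into the bazel invocation fixes that; the non-tuning round count also changes from 2 to 100.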