Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
b41fa3d6
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
b41fa3d6
编写于
11月 27, 2018
作者:
李
李滨
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'multicore' into 'master'
Using guided openmp scheduler See merge request !879
上级
9aa9f272
69583cd6
变更
37
隐藏空白更改
内联
并排
Showing
37 changed file
with
318 addition
and
249 deletion
+318
-249
mace/core/runtime/cpu/cpu_runtime.cc
mace/core/runtime/cpu/cpu_runtime.cc
+139
-88
mace/ops/activation.h
mace/ops/activation.h
+7
-7
mace/ops/argmax.cc
mace/ops/argmax.cc
+1
-1
mace/ops/arm/activation_neon.cc
mace/ops/arm/activation_neon.cc
+4
-4
mace/ops/arm/conv_2d_neon_15x1.cc
mace/ops/arm/conv_2d_neon_15x1.cc
+1
-1
mace/ops/arm/conv_2d_neon_1x15.cc
mace/ops/arm/conv_2d_neon_1x15.cc
+1
-1
mace/ops/arm/conv_2d_neon_1x7.cc
mace/ops/arm/conv_2d_neon_1x7.cc
+1
-1
mace/ops/arm/conv_2d_neon_3x3.cc
mace/ops/arm/conv_2d_neon_3x3.cc
+2
-2
mace/ops/arm/conv_2d_neon_5x5.cc
mace/ops/arm/conv_2d_neon_5x5.cc
+1
-1
mace/ops/arm/conv_2d_neon_7x1.cc
mace/ops/arm/conv_2d_neon_7x1.cc
+1
-1
mace/ops/arm/conv_2d_neon_7x7.cc
mace/ops/arm/conv_2d_neon_7x7.cc
+3
-3
mace/ops/arm/conv_winograd.cc
mace/ops/arm/conv_winograd.cc
+7
-7
mace/ops/arm/deconv_2d_neon_2x2.cc
mace/ops/arm/deconv_2d_neon_2x2.cc
+2
-2
mace/ops/arm/deconv_2d_neon_3x3.cc
mace/ops/arm/deconv_2d_neon_3x3.cc
+2
-2
mace/ops/arm/deconv_2d_neon_4x4.cc
mace/ops/arm/deconv_2d_neon_4x4.cc
+2
-2
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
+2
-2
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+4
-4
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+4
-4
mace/ops/batch_to_space.cc
mace/ops/batch_to_space.cc
+2
-2
mace/ops/channel_shuffle.cc
mace/ops/channel_shuffle.cc
+1
-1
mace/ops/conv_2d.cc
mace/ops/conv_2d.cc
+4
-4
mace/ops/conv_pool_2d_util.cc
mace/ops/conv_pool_2d_util.cc
+2
-2
mace/ops/deconv_2d.cc
mace/ops/deconv_2d.cc
+2
-2
mace/ops/depth_to_space.cc
mace/ops/depth_to_space.cc
+1
-1
mace/ops/eltwise.cc
mace/ops/eltwise.cc
+60
-60
mace/ops/gather.cc
mace/ops/gather.cc
+1
-1
mace/ops/local_response_norm.cc
mace/ops/local_response_norm.cc
+1
-1
mace/ops/pooling.cc
mace/ops/pooling.cc
+4
-4
mace/ops/reduce_mean.cc
mace/ops/reduce_mean.cc
+7
-7
mace/ops/resize_bicubic.cc
mace/ops/resize_bicubic.cc
+1
-1
mace/ops/resize_bilinear.cc
mace/ops/resize_bilinear.cc
+2
-2
mace/ops/sgemm.cc
mace/ops/sgemm.cc
+19
-19
mace/ops/softmax.cc
mace/ops/softmax.cc
+4
-4
mace/ops/space_to_batch.cc
mace/ops/space_to_batch.cc
+2
-2
mace/ops/sqrdiff_mean.cc
mace/ops/sqrdiff_mean.cc
+1
-1
mace/public/mace.h
mace/public/mace.h
+18
-0
mace/utils/quantize.h
mace/utils/quantize.h
+2
-2
未找到文件。
mace/core/runtime/cpu/cpu_runtime.cc
浏览文件 @
b41fa3d6
...
@@ -36,45 +36,98 @@ namespace mace {
...
@@ -36,45 +36,98 @@ namespace mace {
int
MaceOpenMPThreadCount
=
1
;
int
MaceOpenMPThreadCount
=
1
;
namespace
{
struct
CPUFreq
{
size_t
core_id
;
float
freq
;
};
namespace
{
#if defined(__ANDROID__)
int
GetCPUCount
()
{
int
GetCPUCount
()
{
char
path
[
64
];
int
cpu_count
=
0
;
int
cpu_count
=
0
;
int
result
=
0
;
std
::
string
cpu_sys_conf
=
"/proc/cpuinfo"
;
std
::
ifstream
f
(
cpu_sys_conf
);
while
(
true
)
{
if
(
!
f
.
is_open
())
{
snprintf
(
path
,
sizeof
(
path
),
"/sys/devices/system/cpu/cpu%d"
,
cpu_count
);
LOG
(
ERROR
)
<<
"failed to open "
<<
cpu_sys_conf
;
result
=
access
(
path
,
F_OK
);
return
-
1
;
if
(
result
!=
0
)
{
}
if
(
errno
!=
ENOENT
)
{
std
::
string
line
;
LOG
(
ERROR
)
<<
"Access "
<<
path
<<
" failed: "
<<
strerror
(
errno
);
const
std
::
string
processor_key
=
"processor"
;
}
while
(
std
::
getline
(
f
,
line
))
{
return
cpu_count
;
if
(
line
.
size
()
>=
processor_key
.
size
()
&&
line
.
compare
(
0
,
processor_key
.
size
(),
processor_key
)
==
0
)
{
++
cpu_count
;
}
}
cpu_count
++
;
}
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpu_sys_conf
;
}
if
(
!
f
.
eof
())
{
LOG
(
ERROR
)
<<
"failed to read end of "
<<
cpu_sys_conf
;
}
f
.
close
();
VLOG
(
2
)
<<
"CPU cores: "
<<
cpu_count
;
return
cpu_count
;
}
}
#endif
int
GetCPUMaxFreq
(
int
cpu_id
)
{
int
GetCPUMaxFreq
(
std
::
vector
<
float
>
*
max_freqs
)
{
char
path
[
64
];
#if defined(__ANDROID__)
snprintf
(
path
,
sizeof
(
path
),
int
cpu_count
=
GetCPUCount
();
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq"
,
for
(
int
cpu_id
=
0
;
cpu_id
<
cpu_count
;
++
cpu_id
)
{
cpu_id
);
std
::
string
cpuinfo_max_freq_sys_conf
=
MakeString
(
"/sys/devices/system/cpu/cpu"
,
FILE
*
fp
=
fopen
(
path
,
"rb"
);
cpu_id
,
if
(
!
fp
)
{
"/cpufreq/cpuinfo_max_freq"
);
LOG
(
WARNING
)
<<
"File: "
<<
path
<<
" not exists."
;
std
::
ifstream
f
(
cpuinfo_max_freq_sys_conf
);
return
0
;
if
(
!
f
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to open "
<<
cpuinfo_max_freq_sys_conf
;
return
-
1
;
}
std
::
string
line
;
if
(
std
::
getline
(
f
,
line
))
{
float
freq
=
atof
(
line
.
c_str
());
max_freqs
->
push_back
(
freq
);
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpuinfo_max_freq_sys_conf
;
}
f
.
close
();
}
#else
std
::
string
cpu_sys_conf
=
"/proc/cpuinfo"
;
std
::
ifstream
f
(
cpu_sys_conf
);
if
(
!
f
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to open "
<<
cpu_sys_conf
;
return
-
1
;
}
}
std
::
string
line
;
const
std
::
string
freq_key
=
"cpu MHz"
;
while
(
std
::
getline
(
f
,
line
))
{
if
(
line
.
size
()
>=
freq_key
.
size
()
&&
line
.
compare
(
0
,
freq_key
.
size
(),
freq_key
)
==
0
)
{
size_t
pos
=
line
.
find
(
":"
);
if
(
pos
!=
std
::
string
::
npos
)
{
std
::
string
freq_str
=
line
.
substr
(
pos
+
1
);
float
freq
=
atof
(
freq_str
.
c_str
());
max_freqs
->
push_back
(
freq
);
}
}
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpu_sys_conf
;
}
if
(
!
f
.
eof
())
{
LOG
(
ERROR
)
<<
"failed to read end of "
<<
cpu_sys_conf
;
}
f
.
close
();
#endif
int
freq
=
0
;
for
(
float
freq
:
*
max_freqs
)
{
int
items_read
=
fscanf
(
fp
,
"%d"
,
&
freq
);
VLOG
(
2
)
<<
"CPU freq: "
<<
freq
;
if
(
items_read
!=
1
)
{
LOG
(
WARNING
)
<<
"Read file: "
<<
path
<<
" failed."
;
}
}
fclose
(
fp
);
return
freq
;
return
0
;
}
}
MaceStatus
SetThreadAffinity
(
cpu_set_t
mask
)
{
MaceStatus
SetThreadAffinity
(
cpu_set_t
mask
)
{
...
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
...
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
}
}
}
}
MaceStatus
GetCPUBigLittleCoreIDs
(
std
::
vector
<
int
>
*
big_core_ids
,
std
::
vector
<
int
>
*
little_core_ids
)
{
MACE_CHECK_NOTNULL
(
big_core_ids
);
MACE_CHECK_NOTNULL
(
little_core_ids
);
int
cpu_count
=
GetCPUCount
();
std
::
vector
<
int
>
cpu_max_freq
(
cpu_count
);
// set cpu max frequency
for
(
int
i
=
0
;
i
<
cpu_count
;
++
i
)
{
cpu_max_freq
[
i
]
=
GetCPUMaxFreq
(
i
);
if
(
cpu_max_freq
[
i
]
==
0
)
{
LOG
(
WARNING
)
<<
"Cannot get CPU"
<<
i
<<
"'s max frequency info, maybe it is offline."
;
return
MaceStatus
(
MaceStatus
::
MACE_INVALID_ARGS
,
"Cannot get CPU's max frequency info,"
" maybe it is offline."
);
}
}
int
big_core_freq
=
*
(
std
::
max_element
(
cpu_max_freq
.
begin
(),
cpu_max_freq
.
end
()));
int
little_core_freq
=
*
(
std
::
min_element
(
cpu_max_freq
.
begin
(),
cpu_max_freq
.
end
()));
big_core_ids
->
reserve
(
cpu_count
);
little_core_ids
->
reserve
(
cpu_count
);
for
(
int
i
=
0
;
i
<
cpu_count
;
++
i
)
{
if
(
cpu_max_freq
[
i
]
==
little_core_freq
)
{
little_core_ids
->
push_back
(
i
);
}
if
(
cpu_max_freq
[
i
]
==
big_core_freq
)
{
big_core_ids
->
push_back
(
i
);
}
}
return
MaceStatus
::
MACE_SUCCESS
;
}
MaceStatus
SetOpenMPThreadsAndAffinityCPUs
(
int
omp_num_threads
,
MaceStatus
SetOpenMPThreadsAndAffinityCPUs
(
int
omp_num_threads
,
const
std
::
vector
<
in
t
>
&
cpu_ids
)
{
const
std
::
vector
<
size_
t
>
&
cpu_ids
)
{
MaceOpenMPThreadCount
=
omp_num_threads
;
MaceOpenMPThreadCount
=
omp_num_threads
;
#ifdef MACE_ENABLE_OPENMP
#ifdef MACE_ENABLE_OPENMP
VLOG
(
1
)
<<
"Set OpenMP threads number: "
<<
omp_num_threads
VLOG
(
1
)
<<
"Set OpenMP threads number: "
<<
omp_num_threads
<<
", CPU core IDs: "
<<
MakeString
(
cpu_ids
);
<<
", CPU core IDs: "
<<
MakeString
(
cpu_ids
);
omp_set_schedule
(
omp_sched_guided
,
1
);
omp_set_num_threads
(
omp_num_threads
);
omp_set_num_threads
(
omp_num_threads
);
#else
#else
MACE_UNUSED
(
omp_num_threads
);
MACE_UNUSED
(
omp_num_threads
);
...
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
...
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
}
// namespace
}
// namespace
MaceStatus
CPURuntime
::
SetOpenMPThreadsAndAffinityPolicy
(
MaceStatus
CPURuntime
::
SetOpenMPThreadsAndAffinityPolicy
(
int
omp_
num_threads_hint
,
int
num_threads_hint
,
CPUAffinityPolicy
policy
,
CPUAffinityPolicy
policy
,
void
*
gemm_context
)
{
void
*
gemm_context
)
{
// get cpu frequency info
std
::
vector
<
float
>
cpu_max_freqs
;
if
(
GetCPUMaxFreq
(
&
cpu_max_freqs
)
==
-
1
||
cpu_max_freqs
.
size
()
==
0
)
{
return
MaceStatus
::
MACE_INVALID_ARGS
;
}
std
::
vector
<
CPUFreq
>
cpu_freq
(
cpu_max_freqs
.
size
());
for
(
size_t
i
=
0
;
i
<
cpu_max_freqs
.
size
();
++
i
)
{
cpu_freq
[
i
].
core_id
=
i
;
cpu_freq
[
i
].
freq
=
cpu_max_freqs
[
i
];
}
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_POWER_SAVE
||
policy
==
CPUAffinityPolicy
::
AFFINITY_LITTLE_ONLY
)
{
std
::
sort
(
cpu_freq
.
begin
(),
cpu_freq
.
end
(),
[
=
](
const
CPUFreq
&
lhs
,
const
CPUFreq
&
rhs
)
{
return
lhs
.
freq
<
rhs
.
freq
;
});
}
else
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_HIGH_PERFORMANCE
||
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
)
{
std
::
sort
(
cpu_freq
.
begin
(),
cpu_freq
.
end
(),
[](
const
CPUFreq
&
lhs
,
const
CPUFreq
&
rhs
)
{
return
lhs
.
freq
>
rhs
.
freq
;
});
}
int
cpu_count
=
static_cast
<
int
>
(
cpu_freq
.
size
());
if
(
num_threads_hint
<=
0
||
num_threads_hint
>
cpu_count
)
{
num_threads_hint
=
cpu_count
;
}
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_NONE
)
{
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_NONE
)
{
#ifdef MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_QUANTIZE
if
(
gemm_context
)
{
if
(
gemm_context
)
{
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
std
::
max
(
0
,
omp_num_threads_hint
)
);
num_threads_hint
);
}
}
#else
#else
MACE_UNUSED
(
gemm_context
);
MACE_UNUSED
(
gemm_context
);
#endif // MACE_ENABLE_QUANTIZE
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
#ifdef MACE_ENABLE_OPENMP
if
(
omp_num_threads_hint
>
0
)
{
omp_set_num_threads
(
num_threads_hint
);
omp_set_num_threads
(
std
::
min
(
omp_num_threads_hint
,
omp_get_num_procs
()));
}
#else
#else
LOG
(
WARNING
)
<<
"Set OpenMP threads number failed: OpenMP not enabled."
;
LOG
(
WARNING
)
<<
"Set OpenMP threads number failed: OpenMP not enabled."
;
#endif
#endif
return
MaceStatus
::
MACE_SUCCESS
;
return
MaceStatus
::
MACE_SUCCESS
;
}
}
std
::
vector
<
int
>
big_core_ids
;
std
::
vector
<
int
>
little_core_ids
;
MaceStatus
res
=
GetCPUBigLittleCoreIDs
(
&
big_core_ids
,
&
little_core_ids
);
if
(
res
!=
MaceStatus
::
MACE_SUCCESS
)
{
return
res
;
}
std
::
vector
<
int
>
use_cpu_ids
;
// decide num of cores to use
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
)
{
int
cores_to_use
=
0
;
use_cpu_ids
=
std
::
move
(
big_core_ids
);
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
||
policy
==
CPUAffinityPolicy
::
AFFINITY_LITTLE_ONLY
)
{
for
(
size_t
i
=
0
;
i
<
cpu_max_freqs
.
size
();
++
i
)
{
if
(
cpu_freq
[
i
].
freq
!=
cpu_freq
[
0
].
freq
)
{
break
;
}
++
cores_to_use
;
}
num_threads_hint
=
cores_to_use
;
}
else
{
}
else
{
use_cpu_ids
=
std
::
move
(
little_core_ids
)
;
cores_to_use
=
num_threads_hint
;
}
}
if
(
omp_num_threads_hint
<=
0
||
VLOG
(
2
)
<<
"Use "
<<
num_threads_hint
<<
" threads"
;
omp_num_threads_hint
>
static_cast
<
int
>
(
use_cpu_ids
.
size
()))
{
std
::
vector
<
size_t
>
cpu_ids
(
cores_to_use
);
omp_num_threads_hint
=
use_cpu_ids
.
size
();
for
(
int
i
=
0
;
i
<
cores_to_use
;
++
i
)
{
VLOG
(
2
)
<<
"Bind thread to core: "
<<
cpu_freq
[
i
].
core_id
<<
" with freq "
<<
cpu_freq
[
i
].
freq
;
cpu_ids
[
i
]
=
cpu_freq
[
i
].
core_id
;
}
}
#ifdef MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_QUANTIZE
if
(
gemm_context
)
{
if
(
gemm_context
)
{
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
omp_
num_threads_hint
);
num_threads_hint
);
}
}
#endif // MACE_ENABLE_QUANTIZE
#endif // MACE_ENABLE_QUANTIZE
return
SetOpenMPThreadsAndAffinityCPUs
(
omp_num_threads_hint
,
use_
cpu_ids
);
return
SetOpenMPThreadsAndAffinityCPUs
(
num_threads_hint
,
cpu_ids
);
}
}
}
// namespace mace
}
// namespace mace
...
...
mace/ops/activation.h
浏览文件 @
b41fa3d6
...
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
...
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
case
NOOP
:
case
NOOP
:
break
;
break
;
case
RELU
:
case
RELU
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
));
output_ptr
[
i
]
=
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
));
}
}
break
;
break
;
case
RELUX
:
case
RELUX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
min
(
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
)),
output_ptr
[
i
]
=
std
::
min
(
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
)),
static_cast
<
T
>
(
relux_max_limit
));
static_cast
<
T
>
(
relux_max_limit
));
}
}
break
;
break
;
case
TANH
:
case
TANH
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
}
}
break
;
break
;
case
SIGMOID
:
case
SIGMOID
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
}
}
...
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
...
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
ReluxNeon
(
input_ptr
,
relux_max_limit
,
size
,
output_ptr
);
ReluxNeon
(
input_ptr
,
relux_max_limit
,
size
,
output_ptr
);
break
;
break
;
case
TANH
:
case
TANH
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
}
}
break
;
break
;
case
SIGMOID
:
case
SIGMOID
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
}
}
...
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
...
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
const
index_t
inner_size
,
const
index_t
inner_size
,
const
T
*
alpha_ptr
,
const
T
*
alpha_ptr
,
T
*
output_ptr
)
{
T
*
output_ptr
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
for
(
index_t
chan_idx
=
0
;
chan_idx
<
input_chan
;
++
chan_idx
)
{
for
(
index_t
chan_idx
=
0
;
chan_idx
<
input_chan
;
++
chan_idx
)
{
for
(
index_t
j
=
0
;
j
<
inner_size
;
++
j
)
{
for
(
index_t
j
=
0
;
j
<
inner_size
;
++
j
)
{
...
...
mace/ops/argmax.cc
浏览文件 @
b41fa3d6
...
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
...
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
index_t
outer_size
=
output
->
size
();
index_t
outer_size
=
output
->
size
();
index_t
inner_size
=
input
->
dim
(
axis_value
);
index_t
inner_size
=
input
->
dim
(
axis_value
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
int
idx
=
0
;
int
idx
=
0
;
T
max_value
=
std
::
numeric_limits
<
T
>::
lowest
();
T
max_value
=
std
::
numeric_limits
<
T
>::
lowest
();
...
...
mace/ops/arm/activation_neon.cc
浏览文件 @
b41fa3d6
...
@@ -25,7 +25,7 @@ namespace ops {
...
@@ -25,7 +25,7 @@ namespace ops {
void
ReluNeon
(
const
float
*
input
,
const
index_t
size
,
float
*
output
)
{
void
ReluNeon
(
const
float
*
input
,
const
index_t
size
,
float
*
output
)
{
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
v
=
vmaxq_f32
(
v
,
vzero
);
v
=
vmaxq_f32
(
v
,
vzero
);
...
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
...
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
}
}
#else
#else
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
}
}
...
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
...
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vlimit
=
vdupq_n_f32
(
limit
);
float32x4_t
vlimit
=
vdupq_n_f32
(
limit
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
v
=
vmaxq_f32
(
v
,
vzero
);
v
=
vmaxq_f32
(
v
,
vzero
);
...
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
...
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
}
}
#else
#else
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
}
}
...
...
mace/ops/arm/conv_2d_neon_15x1.cc
浏览文件 @
b41fa3d6
...
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
...
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
const
index_t
tile_width
=
const
index_t
tile_width
=
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
3
])
:
out_shape
[
3
];
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
3
])
:
out_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
3
];
w
+=
tile_width
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
3
];
w
+=
tile_width
)
{
...
...
mace/ops/arm/conv_2d_neon_1x15.cc
浏览文件 @
b41fa3d6
...
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
...
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
const
index_t
tile_height
=
const
index_t
tile_height
=
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
2
])
:
out_shape
[
2
];
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
2
])
:
out_shape
[
2
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
2
];
h
+=
tile_height
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
2
];
h
+=
tile_height
)
{
...
...
mace/ops/arm/conv_2d_neon_1x7.cc
浏览文件 @
b41fa3d6
...
@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
...
@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_3x3.cc
浏览文件 @
b41fa3d6
...
@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
...
@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
2
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
2
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
...
@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_shape
[
1
];
++
c
)
{
for
(
index_t
c
=
0
;
c
<
in_shape
[
1
];
++
c
)
{
...
...
mace/ops/arm/conv_2d_neon_5x5.cc
浏览文件 @
b41fa3d6
...
@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
...
@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_7x1.cc
浏览文件 @
b41fa3d6
...
@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
...
@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_7x7.cc
浏览文件 @
b41fa3d6
...
@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
...
@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
...
@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
...
@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_winograd.cc
浏览文件 @
b41fa3d6
...
@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
...
@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
output_batch_size
=
16
*
in_channels
*
tile_count
;
const
index_t
output_batch_size
=
16
*
in_channels
*
tile_count
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
index_t
tile_index
=
0
;
index_t
tile_index
=
0
;
...
@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
...
@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
output_batch_size
=
64
*
in_channels
*
tile_count
;
const
index_t
output_batch_size
=
64
*
in_channels
*
tile_count
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
index_t
tile_index
=
0
;
index_t
tile_index
=
0
;
...
@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
...
@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
index_t
tile_offset
=
0
;
index_t
tile_offset
=
0
;
...
@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
...
@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
index_t
tile_offset
=
0
;
index_t
tile_offset
=
0
;
...
@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
...
@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
float
*
output
)
{
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
const
index_t
stride
=
out_channels
*
in_channels
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
...
@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
...
@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
0.0
f
,
0.0
f
,
1.0
f
}};
{
0.0
f
,
0.0
f
,
1.0
f
}};
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
// load filter
// load filter
...
@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
...
@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
index_t
out_height
=
in_height
-
2
;
index_t
out_height
=
in_height
-
2
;
index_t
out_width
=
in_width
-
2
;
index_t
out_width
=
in_width
-
2
;
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(4)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
...
...
mace/ops/arm/deconv_2d_neon_2x2.cc
浏览文件 @
b41fa3d6
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
if
(
oc
+
1
<
outch
)
{
...
@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
...
@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
...
...
mace/ops/arm/deconv_2d_neon_3x3.cc
浏览文件 @
b41fa3d6
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
if
(
oc
+
1
<
outch
)
{
...
@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
...
@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
...
...
mace/ops/arm/deconv_2d_neon_4x4.cc
浏览文件 @
b41fa3d6
...
@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
...
@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outch
=
out_shape
[
1
];
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
if
(
oc
+
1
<
outch
)
{
...
@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
...
@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
const
index_t
outch
=
out_shape
[
1
];
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
p
=
0
;
p
<
outch
;
p
++
)
{
for
(
index_t
p
=
0
;
p
<
outch
;
p
++
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
...
...
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
浏览文件 @
b41fa3d6
...
@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
...
@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
index_t
c
=
m
/
multiplier
;
index_t
c
=
m
/
multiplier
;
...
@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
...
@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
index_t
c
=
m
/
multiplier
;
index_t
c
=
m
/
multiplier
;
...
...
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
浏览文件 @
b41fa3d6
...
@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
...
@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
const
index_t
offset
=
b
*
channels
+
c
;
...
@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
...
@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
const
index_t
offset
=
b
*
channels
+
c
;
...
@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
...
@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
...
@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
...
@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
++
oc
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
++
oc
)
{
...
...
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
浏览文件 @
b41fa3d6
...
@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
...
@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
const
index_t
offset
=
b
*
channels
+
c
;
...
@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
...
@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
const
index_t
offset
=
b
*
channels
+
c
;
...
@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
...
@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
...
@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
...
@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
++
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
++
)
{
...
...
mace/ops/batch_to_space.cc
浏览文件 @
b41fa3d6
...
@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
...
@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
// make channel outter loop so we can make best use of cache
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
block_h
+=
block_h_size
)
{
block_h
+=
block_h_size
)
{
...
@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
...
@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
index_t
out_width
=
space_tensor
->
dim
(
2
);
index_t
out_width
=
space_tensor
->
dim
(
2
);
index_t
channels
=
space_tensor
->
dim
(
3
);
index_t
channels
=
space_tensor
->
dim
(
3
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
const
index_t
b
=
in_b
%
out_batches
;
const
index_t
b
=
in_b
%
out_batches
;
const
index_t
tile_index
=
in_b
/
out_batches
;
const
index_t
tile_index
=
in_b
/
out_batches
;
...
...
mace/ops/channel_shuffle.cc
浏览文件 @
b41fa3d6
...
@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
...
@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
index_t
batch_size
=
channels
*
image_size
;
index_t
batch_size
=
channels
*
image_size
;
index_t
channels_per_group
=
channels
/
groups_
;
index_t
channels_per_group
=
channels
/
groups_
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
T
*
input_base
=
input_ptr
+
b
*
batch_size
;
const
T
*
input_base
=
input_ptr
+
b
*
batch_size
;
...
...
mace/ops/conv_2d.cc
浏览文件 @
b41fa3d6
...
@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
...
@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// unpack output
// unpack output
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
...
@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
...
@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
if
(
bias_data
!=
nullptr
)
{
if
(
bias_data
!=
nullptr
)
{
const
index_t
image_size
=
height
*
width
;
const
index_t
image_size
=
height
*
width
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
float
*
output_ptr
=
output_data
+
(
b
*
channels
+
c
)
*
image_size
;
float
*
output_ptr
=
output_data
+
(
b
*
channels
+
c
)
*
image_size
;
...
@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
...
@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
const
index_t
out_batch_size
=
filter_shape
[
0
]
*
out_image_size
;
const
index_t
out_batch_size
=
filter_shape
[
0
]
*
out_image_size
;
const
index_t
filter_size
=
filter_shape
[
2
]
*
filter_shape
[
3
];
const
index_t
filter_size
=
filter_shape
[
2
]
*
filter_shape
[
3
];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
b
++
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
b
++
)
{
for
(
index_t
m
=
0
;
m
<
filter_shape
[
0
];
m
+=
4
)
{
for
(
index_t
m
=
0
;
m
<
filter_shape
[
0
];
m
+=
4
)
{
const
index_t
in_width
=
in_shape
[
3
];
const
index_t
in_width
=
in_shape
[
3
];
...
@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
...
@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
const
index_t
input_row_size
=
in_shape
[
2
]
*
in_shape
[
3
];
const
index_t
input_row_size
=
in_shape
[
2
]
*
in_shape
[
3
];
const
index_t
patch_row_size
=
filter_w
*
in_shape
[
3
];
const
index_t
patch_row_size
=
filter_w
*
in_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
...
mace/ops/conv_pool_2d_util.cc
浏览文件 @
b41fa3d6
...
@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
...
@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const
index_t
in_batch_size
=
channels
*
in_image_size
;
const
index_t
in_batch_size
=
channels
*
in_image_size
;
const
index_t
out_batch_size
=
channels
*
out_image_size
;
const
index_t
out_batch_size
=
channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
batch
;
++
i
)
{
for
(
int
i
=
0
;
i
<
batch
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channels
;
++
j
)
{
for
(
int
j
=
0
;
j
<
channels
;
++
j
)
{
for
(
int
k
=
0
;
k
<
height
;
++
k
)
{
for
(
int
k
=
0
;
k
<
height
;
++
k
)
{
...
@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
...
@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
if
(
padding_same_value
)
{
if
(
padding_same_value
)
{
LOG
(
FATAL
)
<<
"Not implemented"
;
LOG
(
FATAL
)
<<
"Not implemented"
;
}
else
{
}
else
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
...
...
mace/ops/deconv_2d.cc
浏览文件 @
b41fa3d6
...
@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
...
@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const
index_t
batch
=
out_shape
[
0
];
const
index_t
batch
=
out_shape
[
0
];
const
index_t
channels
=
out_shape
[
1
];
const
index_t
channels
=
out_shape
[
1
];
const
index_t
img_size
=
out_shape
[
2
]
*
out_shape
[
3
];
const
index_t
img_size
=
out_shape
[
2
]
*
out_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
img_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
img_size
;
++
i
)
{
...
@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
...
@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_channels
=
in_shape
[
1
];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
oc
=
0
;
oc
<
out_channels
;
++
oc
)
{
for
(
int
oc
=
0
;
oc
<
out_channels
;
++
oc
)
{
float
*
out_base
=
float
*
out_base
=
...
...
mace/ops/depth_to_space.cc
浏览文件 @
b41fa3d6
...
@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
...
@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
const
T
*
input_ptr
=
input
->
data
<
T
>
();
const
T
*
input_ptr
=
input
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
output_height
;
++
h
)
{
for
(
index_t
h
=
0
;
h
<
output_height
;
++
h
)
{
...
...
mace/ops/eltwise.cc
浏览文件 @
b41fa3d6
...
@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
switch
(
type
)
{
switch
(
type
)
{
case
SUM
:
case
SUM
:
if
(
coeff
.
empty
())
{
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
if
(
swapped
)
{
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
break
;
case
SUB
:
case
SUB
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
break
;
break
;
case
PROD
:
case
PROD
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
*
input1
[
i
];
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
*
input1
[
i
];
...
@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
break
;
case
DIV
:
case
DIV
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
break
;
break
;
case
MIN
:
case
MIN
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
break
;
break
;
case
MAX
:
case
MAX
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
break
;
break
;
case
SQR_DIFF
:
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
break
;
case
POW
:
case
POW
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
...
@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
break
;
break
;
case
NEG
:
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
output
[
i
]
=
-
input0
[
i
];
}
}
break
;
break
;
case
ABS
:
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
}
break
;
break
;
case
EQUAL
:
case
EQUAL
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
output
[
i
+
d
*
common_size
]
=
...
@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
switch
(
type
)
{
switch
(
type
)
{
case
SUM
:
case
SUM
:
if
(
coeff
.
empty
())
{
if
(
coeff
.
empty
())
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
}
}
...
@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
if
(
swapped
)
{
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
}
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
[
i
]
*
coeff_copy
[
1
];
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
[
i
]
*
coeff_copy
[
1
];
}
}
...
@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
break
;
break
;
case
SUB
:
case
SUB
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
-
input0
[
i
];
output
[
i
]
=
input1
[
i
]
-
input0
[
i
];
}
}
}
}
break
;
break
;
case
PROD
:
case
PROD
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
[
i
];
output
[
i
]
=
input0
[
i
]
*
input1
[
i
];
}
}
...
@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
break
;
break
;
case
DIV
:
case
DIV
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
/
input0
[
i
];
output
[
i
]
=
input1
[
i
]
/
input0
[
i
];
}
}
}
}
break
;
break
;
case
MIN
:
case
MIN
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
[
i
]);
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
[
i
]);
}
}
break
;
break
;
case
MAX
:
case
MAX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
[
i
]);
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
[
i
]);
}
}
break
;
break
;
case
SQR_DIFF
:
case
SQR_DIFF
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
);
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
);
}
}
...
@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
break
;
break
;
case
POW
:
case
POW
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
[
i
]);
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
[
i
]);
}
}
...
@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
...
@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
}
}
break
;
break
;
case
NEG
:
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
output
[
i
]
=
-
input0
[
i
];
}
}
break
;
break
;
case
ABS
:
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
}
break
;
break
;
case
EQUAL
:
case
EQUAL
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
==
input1
[
i
];
output
[
i
]
=
input0
[
i
]
==
input1
[
i
];
}
}
...
@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
switch
(
type
)
{
switch
(
type
)
{
case
SUM
:
case
SUM
:
if
(
coeff
.
empty
())
{
if
(
coeff
.
empty
())
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
;
output
[
i
]
=
input0
[
i
]
+
input1
;
}
}
...
@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
if
(
swapped
)
{
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
}
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
*
coeff_copy
[
1
];
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
*
coeff_copy
[
1
];
}
}
...
@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
break
;
case
SUB
:
case
SUB
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
;
output
[
i
]
=
input0
[
i
]
-
input1
;
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
-
input0
[
i
];
output
[
i
]
=
input1
-
input0
[
i
];
}
}
}
}
break
;
break
;
case
PROD
:
case
PROD
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
;
output
[
i
]
=
input0
[
i
]
*
input1
;
}
}
...
@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
break
;
case
DIV
:
case
DIV
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
;
output
[
i
]
=
input0
[
i
]
/
input1
;
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
/
input0
[
i
];
output
[
i
]
=
input1
/
input0
[
i
];
}
}
}
}
break
;
break
;
case
MIN
:
case
MIN
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
);
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
);
}
}
break
;
break
;
case
MAX
:
case
MAX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
);
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
);
}
}
break
;
break
;
case
SQR_DIFF
:
case
SQR_DIFF
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
,
2.
f
);
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
,
2.
f
);
}
}
...
@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
break
;
case
POW
:
case
POW
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
);
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
);
}
}
...
@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
...
@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
}
}
break
;
break
;
case
NEG
:
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
output
[
i
]
=
-
input0
[
i
];
}
}
break
;
break
;
case
ABS
:
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
}
break
;
break
;
case
EQUAL
:
case
EQUAL
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
==
input1
;
output
[
i
]
=
input0
[
i
]
==
input1
;
}
}
...
@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
switch
(
type
)
{
switch
(
type
)
{
case
SUM
:
case
SUM
:
if
(
coeff
.
empty
())
{
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
if
(
swapped
)
{
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
break
;
case
SUB
:
case
SUB
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
break
;
break
;
case
PROD
:
case
PROD
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
break
;
case
DIV
:
case
DIV
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
break
;
break
;
case
MIN
:
case
MIN
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
break
;
break
;
case
MAX
:
case
MAX
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
break
;
break
;
case
SQR_DIFF
:
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
break
;
case
POW
:
case
POW
:
if
(
!
swapped
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
...
@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
break
;
break
;
case
NEG
:
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
output
[
i
]
=
-
input0
[
i
];
}
}
break
;
break
;
case
ABS
:
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
}
break
;
break
;
case
EQUAL
:
case
EQUAL
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
@@ -989,7 +989,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
...
@@ -989,7 +989,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
index_t
handled_output_size
=
0
;
index_t
handled_output_size
=
0
;
#ifdef MACE_ENABLE_NEON
#ifdef MACE_ENABLE_NEON
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
handled_output_size
;
i
<=
output
->
size
()
-
8
;
i
+=
8
)
{
for
(
index_t
i
=
handled_output_size
;
i
<=
output
->
size
()
-
8
;
i
+=
8
)
{
const
auto
input0_val
=
vld1_u8
(
input0_ptr
+
i
);
const
auto
input0_val
=
vld1_u8
(
input0_ptr
+
i
);
const
auto
input1_val
=
vld1_u8
(
input1_ptr
+
i
);
const
auto
input1_val
=
vld1_u8
(
input1_ptr
+
i
);
...
@@ -1035,7 +1035,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
...
@@ -1035,7 +1035,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
}
}
handled_output_size
=
output
->
size
()
-
output
->
size
()
%
8
;
handled_output_size
=
output
->
size
()
-
output
->
size
()
%
8
;
#endif // NEON
#endif // NEON
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
handled_output_size
;
i
<
output
->
size
();
++
i
)
{
for
(
index_t
i
=
handled_output_size
;
i
<
output
->
size
();
++
i
)
{
const
int32_t
offset_input0
=
input0_ptr
[
i
]
-
input0
->
zero_point
();
const
int32_t
offset_input0
=
input0_ptr
[
i
]
-
input0
->
zero_point
();
const
int32_t
offset_input1
=
input1_ptr
[
i
]
-
input1
->
zero_point
();
const
int32_t
offset_input1
=
input1_ptr
[
i
]
-
input1
->
zero_point
();
...
...
mace/ops/gather.cc
浏览文件 @
b41fa3d6
...
@@ -62,7 +62,7 @@ class GatherOp : public Operation {
...
@@ -62,7 +62,7 @@ class GatherOp : public Operation {
params
->
shape
().
end
(),
1
,
std
::
multiplies
<
index_t
>
());
params
->
shape
().
end
(),
1
,
std
::
multiplies
<
index_t
>
());
index_t
index_size
=
indices
->
size
();
index_t
index_size
=
indices
->
size
();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
l
=
0
;
l
<
lhs_size
;
++
l
)
{
for
(
index_t
l
=
0
;
l
<
lhs_size
;
++
l
)
{
for
(
index_t
idx
=
0
;
idx
<
index_size
;
++
idx
)
{
for
(
index_t
idx
=
0
;
idx
<
index_size
;
++
idx
)
{
MACE_ASSERT
(
indices_data
[
idx
]
<
axis_dim_size
,
"idx out of bound: "
,
MACE_ASSERT
(
indices_data
[
idx
]
<
axis_dim_size
,
"idx out of bound: "
,
...
...
mace/ops/local_response_norm.cc
浏览文件 @
b41fa3d6
...
@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
...
@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
index_t
image_size
=
height
*
width
;
index_t
image_size
=
height
*
width
;
index_t
batch_size
=
channels
*
image_size
;
index_t
batch_size
=
channels
*
image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
int
begin_input_c
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
const
int
begin_input_c
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
...
...
mace/ops/pooling.cc
浏览文件 @
b41fa3d6
...
@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
...
@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
...
@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
...
@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
...
@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
...
@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const
int
*
stride_hw
,
const
int
*
stride_hw
,
const
int
*
pad_hw
,
const
int
*
pad_hw
,
uint8_t
*
output
)
{
uint8_t
*
output
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
...
@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const
int
*
stride_hw
,
const
int
*
stride_hw
,
const
int
*
pad_hw
,
const
int
*
pad_hw
,
uint8_t
*
output
)
{
uint8_t
*
output
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
...
mace/ops/reduce_mean.cc
浏览文件 @
b41fa3d6
...
@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
}
output_ptr
[
0
]
=
sum
/
data_reshape_
[
0
];
output_ptr
[
0
]
=
sum
/
data_reshape_
[
0
];
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
output_ptr
[
i
]
=
input_ptr
[
i
];
output_ptr
[
i
]
=
input_ptr
[
i
];
}
}
...
@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
break
;
case
2
:
case
2
:
if
(
reduce_first_axis_
)
{
if
(
reduce_first_axis_
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
0
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
0
];
++
j
)
{
output_ptr
[
i
]
+=
input_ptr
[
j
*
data_reshape_
[
1
]
+
i
];
output_ptr
[
i
]
+=
input_ptr
[
j
*
data_reshape_
[
1
]
+
i
];
...
@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr
[
i
]
/=
data_reshape_
[
0
];
output_ptr
[
i
]
/=
data_reshape_
[
0
];
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
1
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
1
];
++
j
)
{
output_ptr
[
i
]
+=
input_ptr
[
i
*
data_reshape_
[
1
]
+
j
];
output_ptr
[
i
]
+=
input_ptr
[
i
*
data_reshape_
[
1
]
+
j
];
...
@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
break
;
case
3
:
case
3
:
if
(
reduce_first_axis_
)
{
if
(
reduce_first_axis_
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
0
];
++
k
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
0
];
++
k
)
{
...
@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr
[
i
]
/=
(
data_reshape_
[
0
]
*
data_reshape_
[
2
]);
output_ptr
[
i
]
/=
(
data_reshape_
[
0
]
*
data_reshape_
[
2
]);
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
...
@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
break
;
case
4
:
case
4
:
if
(
reduce_first_axis_
)
{
if
(
reduce_first_axis_
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
3
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
3
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
2
];
++
k
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
2
];
++
k
)
{
...
@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
...
@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
}
}
}
}
else
{
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
...
...
mace/ops/resize_bicubic.cc
浏览文件 @
b41fa3d6
...
@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
...
@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
const
float
height_scale
,
const
float
height_scale
,
const
float
width_scale
,
const
float
width_scale
,
float
*
output
)
{
float
*
output
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
std
::
vector
<
float
>
y_weights
;
std
::
vector
<
float
>
y_weights
;
...
...
mace/ops/resize_bilinear.cc
浏览文件 @
b41fa3d6
...
@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
...
@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
T
*
output
)
{
T
*
output
)
{
const
CachedInterpolation
*
xs
=
xs_vec
.
data
();
const
CachedInterpolation
*
xs
=
xs_vec
.
data
();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
T
const
T
...
@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
...
@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
const
T
*
input_base
=
images
+
b
*
channels
*
in_height
*
in_width
;
const
T
*
input_base
=
images
+
b
*
channels
*
in_height
*
in_width
;
T
*
output_base
=
output
+
b
*
channels
*
out_height
*
out_width
;
T
*
output_base
=
output
+
b
*
channels
*
out_height
*
out_width
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
const
T
const
T
*
y_lower_input_ptr
=
input_base
+
ys
[
y
].
lower
*
in_width
*
channels
;
*
y_lower_input_ptr
=
input_base
+
ys
[
y
].
lower
*
in_width
*
channels
;
...
...
mace/ops/sgemm.cc
浏览文件 @
b41fa3d6
...
@@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
...
@@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
}
}
if
(
batch
>=
MaceOpenMPThreadCount
)
{
if
(
batch
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_RUN_PER_BATCH
MACE_SGEMM_RUN_PER_BATCH
}
else
{
}
else
{
MACE_SGEMM_RUN_PER_BATCH
MACE_SGEMM_RUN_PER_BATCH
...
@@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
...
@@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
// as possible to cache, by tiling lhs by height and rhs by width.
// as possible to cache, by tiling lhs by height and rhs by width.
// w: 4
// w: 4
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
bw
=
0
;
bw
<
block_w
;
++
bw
)
{
for
(
index_t
bw
=
0
;
bw
<
block_w
;
++
bw
)
{
index_t
remain_h
=
height
;
index_t
remain_h
=
height
;
index_t
block_h
=
0
;
index_t
block_h
=
0
;
...
@@ -733,7 +733,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
...
@@ -733,7 +733,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
rhs_data
+=
(
width
-
remain_w
)
*
depth
;
rhs_data
+=
(
width
-
remain_w
)
*
depth
;
// w: 1
// w: 1
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
bw
=
0
;
bw
<
remain_w
;
++
bw
)
{
for
(
index_t
bw
=
0
;
bw
<
remain_w
;
++
bw
)
{
index_t
remain_h
=
height
;
index_t
remain_h
=
height
;
...
@@ -954,7 +954,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
...
@@ -954,7 +954,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
PackPerBatch(src, order, b, packed_data + b * height * width); \
PackPerBatch(src, order, b, packed_data + b * height * width); \
}
}
if
(
src
.
batch
()
>=
MaceOpenMPThreadCount
)
{
if
(
src
.
batch
()
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_PACK_PER_BATCH
MACE_SGEMM_PACK_PER_BATCH
}
else
{
}
else
{
MACE_SGEMM_PACK_PER_BATCH
MACE_SGEMM_PACK_PER_BATCH
...
@@ -976,7 +976,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
...
@@ -976,7 +976,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
}
}
if
(
matrix_map
->
batch
()
>=
MaceOpenMPThreadCount
)
{
if
(
matrix_map
->
batch
()
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_UNPACK_PER_BATCH
MACE_SGEMM_UNPACK_PER_BATCH
}
else
{
}
else
{
MACE_SGEMM_UNPACK_PER_BATCH
MACE_SGEMM_UNPACK_PER_BATCH
...
@@ -999,7 +999,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -999,7 +999,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t
h
=
0
;
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
@@ -1020,7 +1020,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1020,7 +1020,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
h
+=
(
height
-
h
)
/
8
*
8
;
h
+=
(
height
-
h
)
/
8
*
8
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
@@ -1036,7 +1036,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1036,7 +1036,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
h
+=
(
height
-
h
)
/
4
*
4
;
h
+=
(
height
-
h
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
std
::
copy_n
(
src_data
+
ih
*
width
,
width
,
packed_data
+
ih
*
width
);
std
::
copy_n
(
src_data
+
ih
*
width
,
width
,
packed_data
+
ih
*
width
);
}
}
...
@@ -1046,7 +1046,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1046,7 +1046,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t
h
=
0
;
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
@@ -1061,7 +1061,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1061,7 +1061,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
h
+=
(
height
-
h
)
/
8
*
8
;
h
+=
(
height
-
h
)
/
8
*
8
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
@@ -1074,7 +1074,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1074,7 +1074,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
h
+=
(
height
-
h
)
/
4
*
4
;
h
+=
(
height
-
h
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
@@ -1087,7 +1087,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1087,7 +1087,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing no-transpose rhs.
// This is for packing no-transpose rhs.
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
;
const
float
*
src_data_ptr
=
src_data
+
iw
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
@@ -1100,7 +1100,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1100,7 +1100,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
;
const
float
*
src_data_ptr
=
src_data
+
iw
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
@@ -1113,7 +1113,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1113,7 +1113,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing transpose-needed rhs.
// This is for packing transpose-needed rhs.
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
*
height
;
const
float
*
src_data_ptr
=
src_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
@@ -1129,7 +1129,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
...
@@ -1129,7 +1129,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
src_data
+
iw
*
height
,
height
,
packed_data
+
iw
*
height
);
std
::
copy_n
(
src_data
+
iw
*
height
,
height
,
packed_data
+
iw
*
height
);
}
}
...
@@ -1149,7 +1149,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
...
@@ -1149,7 +1149,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for non-transposed result
// This is for non-transposed result
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
...
@@ -1162,7 +1162,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
...
@@ -1162,7 +1162,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
...
@@ -1174,7 +1174,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
...
@@ -1174,7 +1174,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for transposed result
// This is for transposed result
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
*
height
;
...
@@ -1190,7 +1190,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
...
@@ -1190,7 +1190,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
std
::
copy_n
(
packed_data
+
iw
*
height
,
height
,
unpacked_data
+
iw
*
height
);
packed_data
+
iw
*
height
,
height
,
unpacked_data
+
iw
*
height
);
...
...
mace/ops/softmax.cc
浏览文件 @
b41fa3d6
...
@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
...
@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
const
index_t
batch_size
=
class_count
*
class_size
;
const
index_t
batch_size
=
class_count
*
class_size
;
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
b
*
batch_size
+
k
;
const
float
*
input_ptr
=
input_data
+
b
*
batch_size
+
k
;
float
*
output_ptr
=
output_data
+
b
*
batch_size
+
k
;
float
*
output_ptr
=
output_data
+
b
*
batch_size
+
k
;
...
@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
...
@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
}
else
if
(
input
->
dim_size
()
==
2
)
{
// normal 2d softmax
}
else
if
(
input
->
dim_size
()
==
2
)
{
// normal 2d softmax
const
index_t
class_size
=
input
->
dim
(
0
);
const
index_t
class_size
=
input
->
dim
(
0
);
const
index_t
class_count
=
input
->
dim
(
1
);
const
index_t
class_count
=
input
->
dim
(
1
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
k
*
class_count
;
const
float
*
input_ptr
=
input_data
+
k
*
class_count
;
float
*
output_ptr
=
output_data
+
k
*
class_count
;
float
*
output_ptr
=
output_data
+
k
*
class_count
;
...
@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
...
@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
// If depth is short, do it using float32. Float computation should not
// If depth is short, do it using float32. Float computation should not
// be here, but as long as it is on CPU, it is fine.
// be here, but as long as it is on CPU, it is fine.
if
(
depth
<
32
)
{
if
(
depth
<
32
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
...
@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
...
@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
(
1ll
<<
31
)
-
1.0
));
(
1ll
<<
31
)
-
1.0
));
int32_t
input_delta_limit
=
-
((
1ll
<<
31
)
-
1
)
/
scale_q
;
int32_t
input_delta_limit
=
-
((
1ll
<<
31
)
-
1
)
/
scale_q
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
...
...
mace/ops/space_to_batch.cc
浏览文件 @
b41fa3d6
...
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
...
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
// make channel outer loop so we can make best use of cache
// make channel outer loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
block_h
+=
block_h_size
)
{
block_h
+=
block_h_size
)
{
...
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
...
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
index_t
out_width
=
batch_tensor
->
dim
(
2
);
index_t
out_width
=
batch_tensor
->
dim
(
2
);
index_t
channels
=
batch_tensor
->
dim
(
3
);
index_t
channels
=
batch_tensor
->
dim
(
3
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
const
index_t
in_b
=
b
%
in_batches
;
const
index_t
in_b
=
b
%
in_batches
;
const
index_t
tile_index
=
b
/
in_batches
;
const
index_t
tile_index
=
b
/
in_batches
;
...
...
mace/ops/sqrdiff_mean.cc
浏览文件 @
b41fa3d6
...
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
...
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
const
index_t
img_size
=
input0
->
dim
(
2
)
*
input0
->
dim
(
3
);
const
index_t
img_size
=
input0
->
dim
(
2
)
*
input0
->
dim
(
3
);
const
index_t
bc
=
input0
->
dim
(
0
)
*
input0
->
dim
(
1
);
const
index_t
bc
=
input0
->
dim
(
0
)
*
input0
->
dim
(
1
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
bc
;
++
i
)
{
for
(
int
i
=
0
;
i
<
bc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
img_size
;
++
j
)
{
for
(
int
j
=
0
;
j
<
img_size
;
++
j
)
{
T
diff
=
input_ptr0
[
i
*
img_size
+
j
]
-
input_ptr1
[
i
];
T
diff
=
input_ptr0
[
i
*
img_size
+
j
]
-
input_ptr1
[
i
];
...
...
mace/public/mace.h
浏览文件 @
b41fa3d6
...
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
...
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
PRIORITY_HIGH
=
3
PRIORITY_HIGH
=
3
};
};
// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
// scheduled.
// If 'num_threads_hint' is -1 or greater than the number of available cores,
// 'num_threads_hint' will be reset to the number of available cores.
// AFFINITY_BIG_ONLY: all available big cores are used, and the number of
// threads is equal to the number of available big cores.
// AFFINITY_LITTLE_ONLY: all available little cores are used, and the number
// of threads is equal to the number of available little cores.
// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on different
// cores, choosing the 'num_threads_hint' cores with the highest frequencies.
// If 'num_threads_hint' is -1 or greater than the number of available cores,
// 'num_threads_hint' will be reset to the number of available cores.
// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on different
// cores, choosing the 'num_threads_hint' cores with the lowest frequencies.
// If 'num_threads_hint' is -1 or greater than the number of available cores,
// 'num_threads_hint' will be reset to the number of available cores.
enum
CPUAffinityPolicy
{
enum
CPUAffinityPolicy
{
AFFINITY_NONE
=
0
,
AFFINITY_NONE
=
0
,
AFFINITY_BIG_ONLY
=
1
,
AFFINITY_BIG_ONLY
=
1
,
AFFINITY_LITTLE_ONLY
=
2
,
AFFINITY_LITTLE_ONLY
=
2
,
AFFINITY_HIGH_PERFORMANCE
=
3
,
AFFINITY_POWER_SAVE
=
4
,
};
};
struct
CallStats
{
struct
CallStats
{
...
...
mace/utils/quantize.h
浏览文件 @
b41fa3d6
...
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
...
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
int32_t
zero_point
,
int32_t
zero_point
,
T
*
output
)
{
T
*
output
)
{
float
recip_scale
=
1
/
scale
;
float
recip_scale
=
1
/
scale
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
Saturate
<
T
>
(
roundf
(
zero_point
+
recip_scale
*
input
[
i
]));
output
[
i
]
=
Saturate
<
T
>
(
roundf
(
zero_point
+
recip_scale
*
input
[
i
]));
}
}
...
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
...
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
const
float
scale
,
const
float
scale
,
const
int32_t
zero_point
,
const
int32_t
zero_point
,
float
*
output
)
{
float
*
output
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
scale
*
(
input
[
i
]
-
zero_point
);
output
[
i
]
=
scale
*
(
input
[
i
]
-
zero_point
);
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录