Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
69583cd6
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
69583cd6
编写于
11月 21, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Using guided openmp scheduler
上级
0102ad55
变更
37
隐藏空白更改
内联
并排
Showing
37 changed file
with
318 addition
and
249 deletion
+318
-249
mace/core/runtime/cpu/cpu_runtime.cc
mace/core/runtime/cpu/cpu_runtime.cc
+139
-88
mace/ops/activation.h
mace/ops/activation.h
+7
-7
mace/ops/argmax.cc
mace/ops/argmax.cc
+1
-1
mace/ops/arm/activation_neon.cc
mace/ops/arm/activation_neon.cc
+4
-4
mace/ops/arm/conv_2d_neon_15x1.cc
mace/ops/arm/conv_2d_neon_15x1.cc
+1
-1
mace/ops/arm/conv_2d_neon_1x15.cc
mace/ops/arm/conv_2d_neon_1x15.cc
+1
-1
mace/ops/arm/conv_2d_neon_1x7.cc
mace/ops/arm/conv_2d_neon_1x7.cc
+1
-1
mace/ops/arm/conv_2d_neon_3x3.cc
mace/ops/arm/conv_2d_neon_3x3.cc
+2
-2
mace/ops/arm/conv_2d_neon_5x5.cc
mace/ops/arm/conv_2d_neon_5x5.cc
+1
-1
mace/ops/arm/conv_2d_neon_7x1.cc
mace/ops/arm/conv_2d_neon_7x1.cc
+1
-1
mace/ops/arm/conv_2d_neon_7x7.cc
mace/ops/arm/conv_2d_neon_7x7.cc
+3
-3
mace/ops/arm/conv_winograd.cc
mace/ops/arm/conv_winograd.cc
+7
-7
mace/ops/arm/deconv_2d_neon_2x2.cc
mace/ops/arm/deconv_2d_neon_2x2.cc
+2
-2
mace/ops/arm/deconv_2d_neon_3x3.cc
mace/ops/arm/deconv_2d_neon_3x3.cc
+2
-2
mace/ops/arm/deconv_2d_neon_4x4.cc
mace/ops/arm/deconv_2d_neon_4x4.cc
+2
-2
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
+2
-2
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+4
-4
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+4
-4
mace/ops/batch_to_space.cc
mace/ops/batch_to_space.cc
+2
-2
mace/ops/channel_shuffle.cc
mace/ops/channel_shuffle.cc
+1
-1
mace/ops/conv_2d.cc
mace/ops/conv_2d.cc
+4
-4
mace/ops/conv_pool_2d_util.cc
mace/ops/conv_pool_2d_util.cc
+2
-2
mace/ops/deconv_2d.cc
mace/ops/deconv_2d.cc
+2
-2
mace/ops/depth_to_space.cc
mace/ops/depth_to_space.cc
+1
-1
mace/ops/eltwise.cc
mace/ops/eltwise.cc
+60
-60
mace/ops/gather.cc
mace/ops/gather.cc
+1
-1
mace/ops/local_response_norm.cc
mace/ops/local_response_norm.cc
+1
-1
mace/ops/pooling.cc
mace/ops/pooling.cc
+4
-4
mace/ops/reduce_mean.cc
mace/ops/reduce_mean.cc
+7
-7
mace/ops/resize_bicubic.cc
mace/ops/resize_bicubic.cc
+1
-1
mace/ops/resize_bilinear.cc
mace/ops/resize_bilinear.cc
+2
-2
mace/ops/sgemm.cc
mace/ops/sgemm.cc
+19
-19
mace/ops/softmax.cc
mace/ops/softmax.cc
+4
-4
mace/ops/space_to_batch.cc
mace/ops/space_to_batch.cc
+2
-2
mace/ops/sqrdiff_mean.cc
mace/ops/sqrdiff_mean.cc
+1
-1
mace/public/mace.h
mace/public/mace.h
+18
-0
mace/utils/quantize.h
mace/utils/quantize.h
+2
-2
未找到文件。
mace/core/runtime/cpu/cpu_runtime.cc
浏览文件 @
69583cd6
...
...
@@ -36,45 +36,98 @@ namespace mace {
int
MaceOpenMPThreadCount
=
1
;
namespace
{
struct
CPUFreq
{
size_t
core_id
;
float
freq
;
};
namespace
{
#if defined(__ANDROID__)
int
GetCPUCount
()
{
char
path
[
64
];
int
cpu_count
=
0
;
int
result
=
0
;
while
(
true
)
{
snprintf
(
path
,
sizeof
(
path
),
"/sys/devices/system/cpu/cpu%d"
,
cpu_count
);
result
=
access
(
path
,
F_OK
);
if
(
result
!=
0
)
{
if
(
errno
!=
ENOENT
)
{
LOG
(
ERROR
)
<<
"Access "
<<
path
<<
" failed: "
<<
strerror
(
errno
);
}
return
cpu_count
;
std
::
string
cpu_sys_conf
=
"/proc/cpuinfo"
;
std
::
ifstream
f
(
cpu_sys_conf
);
if
(
!
f
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to open "
<<
cpu_sys_conf
;
return
-
1
;
}
std
::
string
line
;
const
std
::
string
processor_key
=
"processor"
;
while
(
std
::
getline
(
f
,
line
))
{
if
(
line
.
size
()
>=
processor_key
.
size
()
&&
line
.
compare
(
0
,
processor_key
.
size
(),
processor_key
)
==
0
)
{
++
cpu_count
;
}
cpu_count
++
;
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpu_sys_conf
;
}
if
(
!
f
.
eof
())
{
LOG
(
ERROR
)
<<
"failed to read end of "
<<
cpu_sys_conf
;
}
f
.
close
();
VLOG
(
2
)
<<
"CPU cores: "
<<
cpu_count
;
return
cpu_count
;
}
#endif
int
GetCPUMaxFreq
(
int
cpu_id
)
{
char
path
[
64
];
snprintf
(
path
,
sizeof
(
path
),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq"
,
cpu_id
);
FILE
*
fp
=
fopen
(
path
,
"rb"
);
if
(
!
fp
)
{
LOG
(
WARNING
)
<<
"File: "
<<
path
<<
" not exists."
;
return
0
;
int
GetCPUMaxFreq
(
std
::
vector
<
float
>
*
max_freqs
)
{
#if defined(__ANDROID__)
int
cpu_count
=
GetCPUCount
();
for
(
int
cpu_id
=
0
;
cpu_id
<
cpu_count
;
++
cpu_id
)
{
std
::
string
cpuinfo_max_freq_sys_conf
=
MakeString
(
"/sys/devices/system/cpu/cpu"
,
cpu_id
,
"/cpufreq/cpuinfo_max_freq"
);
std
::
ifstream
f
(
cpuinfo_max_freq_sys_conf
);
if
(
!
f
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to open "
<<
cpuinfo_max_freq_sys_conf
;
return
-
1
;
}
std
::
string
line
;
if
(
std
::
getline
(
f
,
line
))
{
float
freq
=
atof
(
line
.
c_str
());
max_freqs
->
push_back
(
freq
);
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpuinfo_max_freq_sys_conf
;
}
f
.
close
();
}
#else
std
::
string
cpu_sys_conf
=
"/proc/cpuinfo"
;
std
::
ifstream
f
(
cpu_sys_conf
);
if
(
!
f
.
is_open
())
{
LOG
(
ERROR
)
<<
"failed to open "
<<
cpu_sys_conf
;
return
-
1
;
}
std
::
string
line
;
const
std
::
string
freq_key
=
"cpu MHz"
;
while
(
std
::
getline
(
f
,
line
))
{
if
(
line
.
size
()
>=
freq_key
.
size
()
&&
line
.
compare
(
0
,
freq_key
.
size
(),
freq_key
)
==
0
)
{
size_t
pos
=
line
.
find
(
":"
);
if
(
pos
!=
std
::
string
::
npos
)
{
std
::
string
freq_str
=
line
.
substr
(
pos
+
1
);
float
freq
=
atof
(
freq_str
.
c_str
());
max_freqs
->
push_back
(
freq
);
}
}
}
if
(
f
.
bad
())
{
LOG
(
ERROR
)
<<
"failed to read "
<<
cpu_sys_conf
;
}
if
(
!
f
.
eof
())
{
LOG
(
ERROR
)
<<
"failed to read end of "
<<
cpu_sys_conf
;
}
f
.
close
();
#endif
int
freq
=
0
;
int
items_read
=
fscanf
(
fp
,
"%d"
,
&
freq
);
if
(
items_read
!=
1
)
{
LOG
(
WARNING
)
<<
"Read file: "
<<
path
<<
" failed."
;
for
(
float
freq
:
*
max_freqs
)
{
VLOG
(
2
)
<<
"CPU freq: "
<<
freq
;
}
fclose
(
fp
);
return
freq
;
return
0
;
}
MaceStatus
SetThreadAffinity
(
cpu_set_t
mask
)
{
...
...
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
}
}
MaceStatus
GetCPUBigLittleCoreIDs
(
std
::
vector
<
int
>
*
big_core_ids
,
std
::
vector
<
int
>
*
little_core_ids
)
{
MACE_CHECK_NOTNULL
(
big_core_ids
);
MACE_CHECK_NOTNULL
(
little_core_ids
);
int
cpu_count
=
GetCPUCount
();
std
::
vector
<
int
>
cpu_max_freq
(
cpu_count
);
// set cpu max frequency
for
(
int
i
=
0
;
i
<
cpu_count
;
++
i
)
{
cpu_max_freq
[
i
]
=
GetCPUMaxFreq
(
i
);
if
(
cpu_max_freq
[
i
]
==
0
)
{
LOG
(
WARNING
)
<<
"Cannot get CPU"
<<
i
<<
"'s max frequency info, maybe it is offline."
;
return
MaceStatus
(
MaceStatus
::
MACE_INVALID_ARGS
,
"Cannot get CPU's max frequency info,"
" maybe it is offline."
);
}
}
int
big_core_freq
=
*
(
std
::
max_element
(
cpu_max_freq
.
begin
(),
cpu_max_freq
.
end
()));
int
little_core_freq
=
*
(
std
::
min_element
(
cpu_max_freq
.
begin
(),
cpu_max_freq
.
end
()));
big_core_ids
->
reserve
(
cpu_count
);
little_core_ids
->
reserve
(
cpu_count
);
for
(
int
i
=
0
;
i
<
cpu_count
;
++
i
)
{
if
(
cpu_max_freq
[
i
]
==
little_core_freq
)
{
little_core_ids
->
push_back
(
i
);
}
if
(
cpu_max_freq
[
i
]
==
big_core_freq
)
{
big_core_ids
->
push_back
(
i
);
}
}
return
MaceStatus
::
MACE_SUCCESS
;
}
MaceStatus
SetOpenMPThreadsAndAffinityCPUs
(
int
omp_num_threads
,
const
std
::
vector
<
in
t
>
&
cpu_ids
)
{
const
std
::
vector
<
size_
t
>
&
cpu_ids
)
{
MaceOpenMPThreadCount
=
omp_num_threads
;
#ifdef MACE_ENABLE_OPENMP
VLOG
(
1
)
<<
"Set OpenMP threads number: "
<<
omp_num_threads
<<
", CPU core IDs: "
<<
MakeString
(
cpu_ids
);
omp_set_schedule
(
omp_sched_guided
,
1
);
omp_set_num_threads
(
omp_num_threads
);
#else
MACE_UNUSED
(
omp_num_threads
);
...
...
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
}
// namespace
MaceStatus
CPURuntime
::
SetOpenMPThreadsAndAffinityPolicy
(
int
omp_
num_threads_hint
,
int
num_threads_hint
,
CPUAffinityPolicy
policy
,
void
*
gemm_context
)
{
// get cpu frequency info
std
::
vector
<
float
>
cpu_max_freqs
;
if
(
GetCPUMaxFreq
(
&
cpu_max_freqs
)
==
-
1
||
cpu_max_freqs
.
size
()
==
0
)
{
return
MaceStatus
::
MACE_INVALID_ARGS
;
}
std
::
vector
<
CPUFreq
>
cpu_freq
(
cpu_max_freqs
.
size
());
for
(
size_t
i
=
0
;
i
<
cpu_max_freqs
.
size
();
++
i
)
{
cpu_freq
[
i
].
core_id
=
i
;
cpu_freq
[
i
].
freq
=
cpu_max_freqs
[
i
];
}
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_POWER_SAVE
||
policy
==
CPUAffinityPolicy
::
AFFINITY_LITTLE_ONLY
)
{
std
::
sort
(
cpu_freq
.
begin
(),
cpu_freq
.
end
(),
[
=
](
const
CPUFreq
&
lhs
,
const
CPUFreq
&
rhs
)
{
return
lhs
.
freq
<
rhs
.
freq
;
});
}
else
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_HIGH_PERFORMANCE
||
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
)
{
std
::
sort
(
cpu_freq
.
begin
(),
cpu_freq
.
end
(),
[](
const
CPUFreq
&
lhs
,
const
CPUFreq
&
rhs
)
{
return
lhs
.
freq
>
rhs
.
freq
;
});
}
int
cpu_count
=
static_cast
<
int
>
(
cpu_freq
.
size
());
if
(
num_threads_hint
<=
0
||
num_threads_hint
>
cpu_count
)
{
num_threads_hint
=
cpu_count
;
}
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_NONE
)
{
#ifdef MACE_ENABLE_QUANTIZE
if
(
gemm_context
)
{
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
std
::
max
(
0
,
omp_num_threads_hint
)
);
num_threads_hint
);
}
#else
MACE_UNUSED
(
gemm_context
);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
if
(
omp_num_threads_hint
>
0
)
{
omp_set_num_threads
(
std
::
min
(
omp_num_threads_hint
,
omp_get_num_procs
()));
}
omp_set_num_threads
(
num_threads_hint
);
#else
LOG
(
WARNING
)
<<
"Set OpenMP threads number failed: OpenMP not enabled."
;
#endif
return
MaceStatus
::
MACE_SUCCESS
;
}
std
::
vector
<
int
>
big_core_ids
;
std
::
vector
<
int
>
little_core_ids
;
MaceStatus
res
=
GetCPUBigLittleCoreIDs
(
&
big_core_ids
,
&
little_core_ids
);
if
(
res
!=
MaceStatus
::
MACE_SUCCESS
)
{
return
res
;
}
std
::
vector
<
int
>
use_cpu_ids
;
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
)
{
use_cpu_ids
=
std
::
move
(
big_core_ids
);
// decide num of cores to use
int
cores_to_use
=
0
;
if
(
policy
==
CPUAffinityPolicy
::
AFFINITY_BIG_ONLY
||
policy
==
CPUAffinityPolicy
::
AFFINITY_LITTLE_ONLY
)
{
for
(
size_t
i
=
0
;
i
<
cpu_max_freqs
.
size
();
++
i
)
{
if
(
cpu_freq
[
i
].
freq
!=
cpu_freq
[
0
].
freq
)
{
break
;
}
++
cores_to_use
;
}
num_threads_hint
=
cores_to_use
;
}
else
{
use_cpu_ids
=
std
::
move
(
little_core_ids
)
;
cores_to_use
=
num_threads_hint
;
}
if
(
omp_num_threads_hint
<=
0
||
omp_num_threads_hint
>
static_cast
<
int
>
(
use_cpu_ids
.
size
()))
{
omp_num_threads_hint
=
use_cpu_ids
.
size
();
VLOG
(
2
)
<<
"Use "
<<
num_threads_hint
<<
" threads"
;
std
::
vector
<
size_t
>
cpu_ids
(
cores_to_use
);
for
(
int
i
=
0
;
i
<
cores_to_use
;
++
i
)
{
VLOG
(
2
)
<<
"Bind thread to core: "
<<
cpu_freq
[
i
].
core_id
<<
" with freq "
<<
cpu_freq
[
i
].
freq
;
cpu_ids
[
i
]
=
cpu_freq
[
i
].
core_id
;
}
#ifdef MACE_ENABLE_QUANTIZE
if
(
gemm_context
)
{
static_cast
<
gemmlowp
::
GemmContext
*>
(
gemm_context
)
->
set_max_num_threads
(
omp_
num_threads_hint
);
num_threads_hint
);
}
#endif // MACE_ENABLE_QUANTIZE
return
SetOpenMPThreadsAndAffinityCPUs
(
omp_num_threads_hint
,
use_
cpu_ids
);
return
SetOpenMPThreadsAndAffinityCPUs
(
num_threads_hint
,
cpu_ids
);
}
}
// namespace mace
...
...
mace/ops/activation.h
浏览文件 @
69583cd6
...
...
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
case
NOOP
:
break
;
case
RELU
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
));
}
break
;
case
RELUX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
min
(
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
)),
static_cast
<
T
>
(
relux_max_limit
));
}
break
;
case
TANH
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
}
break
;
case
SIGMOID
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
}
...
...
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
ReluxNeon
(
input_ptr
,
relux_max_limit
,
size
,
output_ptr
);
break
;
case
TANH
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
tanh
(
input_ptr
[
i
]);
}
break
;
case
SIGMOID
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
1
/
(
1
+
std
::
exp
(
-
input_ptr
[
i
]));
}
...
...
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
const
index_t
inner_size
,
const
T
*
alpha_ptr
,
T
*
output_ptr
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
for
(
index_t
chan_idx
=
0
;
chan_idx
<
input_chan
;
++
chan_idx
)
{
for
(
index_t
j
=
0
;
j
<
inner_size
;
++
j
)
{
...
...
mace/ops/argmax.cc
浏览文件 @
69583cd6
...
...
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
index_t
outer_size
=
output
->
size
();
index_t
inner_size
=
input
->
dim
(
axis_value
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
outer_size
;
++
i
)
{
int
idx
=
0
;
T
max_value
=
std
::
numeric_limits
<
T
>::
lowest
();
...
...
mace/ops/arm/activation_neon.cc
浏览文件 @
69583cd6
...
...
@@ -25,7 +25,7 @@ namespace ops {
void
ReluNeon
(
const
float
*
input
,
const
index_t
size
,
float
*
output
)
{
#if defined(MACE_ENABLE_NEON)
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
v
=
vmaxq_f32
(
v
,
vzero
);
...
...
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
}
#else
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input
[
i
],
0.
f
);
}
...
...
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
#if defined(MACE_ENABLE_NEON)
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vlimit
=
vdupq_n_f32
(
limit
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<=
size
-
4
;
i
+=
4
)
{
float32x4_t
v
=
vld1q_f32
(
input
+
i
);
v
=
vmaxq_f32
(
v
,
vzero
);
...
...
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
}
#else
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
0.
f
),
limit
);
}
...
...
mace/ops/arm/conv_2d_neon_15x1.cc
浏览文件 @
69583cd6
...
...
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
const
index_t
tile_width
=
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
3
])
:
out_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
3
];
w
+=
tile_width
)
{
...
...
mace/ops/arm/conv_2d_neon_1x15.cc
浏览文件 @
69583cd6
...
...
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
const
index_t
tile_height
=
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
2
])
:
out_shape
[
2
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
2
];
h
+=
tile_height
)
{
...
...
mace/ops/arm/conv_2d_neon_1x7.cc
浏览文件 @
69583cd6
...
...
@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_3x3.cc
浏览文件 @
69583cd6
...
...
@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
2
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_shape
[
1
];
++
c
)
{
...
...
mace/ops/arm/conv_2d_neon_5x5.cc
浏览文件 @
69583cd6
...
...
@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_7x1.cc
浏览文件 @
69583cd6
...
...
@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_2d_neon_7x7.cc
浏览文件 @
69583cd6
...
...
@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
...
...
mace/ops/arm/conv_winograd.cc
浏览文件 @
69583cd6
...
...
@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
output_batch_size
=
16
*
in_channels
*
tile_count
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
index_t
tile_index
=
0
;
...
...
@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
const
index_t
input_batch_size
=
in_height_width
*
in_channels
;
const
index_t
output_batch_size
=
64
*
in_channels
*
tile_count
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
index_t
tile_index
=
0
;
...
...
@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
index_t
tile_offset
=
0
;
...
...
@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
const
index_t
out_image_size
=
out_height
*
out_width
;
const
index_t
output_batch_size
=
out_channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
index_t
tile_offset
=
0
;
...
...
@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
float
g0
,
g1
,
g2
,
g3
,
g4
,
g5
,
g6
,
g7
,
g8
;
...
...
@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
0.0
f
,
0.0
f
,
1.0
f
}};
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
// load filter
...
...
@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
index_t
out_height
=
in_height
-
2
;
index_t
out_width
=
in_width
-
2
;
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(4)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
...
...
mace/ops/arm/deconv_2d_neon_2x2.cc
浏览文件 @
69583cd6
...
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
...
...
@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
...
...
mace/ops/arm/deconv_2d_neon_3x3.cc
浏览文件 @
69583cd6
...
...
@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
...
...
@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
...
...
mace/ops/arm/deconv_2d_neon_4x4.cc
浏览文件 @
69583cd6
...
...
@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
...
...
@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
p
=
0
;
p
<
outch
;
p
++
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
...
...
mace/ops/arm/depthwise_conv2d_neon_3x3.cc
浏览文件 @
69583cd6
...
...
@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
index_t
c
=
m
/
multiplier
;
...
...
@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
index_t
c
=
m
/
multiplier
;
...
...
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
浏览文件 @
69583cd6
...
...
@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
...
...
@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
...
...
@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
...
...
@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
++
oc
)
{
...
...
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
浏览文件 @
69583cd6
...
...
@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
...
...
@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
...
...
@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
...
...
@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const
index_t
inch_g
=
inch
/
group
;
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
index_t
oc
=
0
;
oc
<
outch_g
;
oc
++
)
{
...
...
mace/ops/batch_to_space.cc
浏览文件 @
69583cd6
...
...
@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
block_h
+=
block_h_size
)
{
...
...
@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
index_t
out_width
=
space_tensor
->
dim
(
2
);
index_t
channels
=
space_tensor
->
dim
(
3
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
const
index_t
b
=
in_b
%
out_batches
;
const
index_t
tile_index
=
in_b
/
out_batches
;
...
...
mace/ops/channel_shuffle.cc
浏览文件 @
69583cd6
...
...
@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
index_t
batch_size
=
channels
*
image_size
;
index_t
channels_per_group
=
channels
/
groups_
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
T
*
input_base
=
input_ptr
+
b
*
batch_size
;
...
...
mace/ops/conv_2d.cc
浏览文件 @
69583cd6
...
...
@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// unpack output
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
...
...
@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
if
(
bias_data
!=
nullptr
)
{
const
index_t
image_size
=
height
*
width
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
float
*
output_ptr
=
output_data
+
(
b
*
channels
+
c
)
*
image_size
;
...
...
@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
const
index_t
out_batch_size
=
filter_shape
[
0
]
*
out_image_size
;
const
index_t
filter_size
=
filter_shape
[
2
]
*
filter_shape
[
3
];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
b
++
)
{
for
(
index_t
m
=
0
;
m
<
filter_shape
[
0
];
m
+=
4
)
{
const
index_t
in_width
=
in_shape
[
3
];
...
...
@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
const
index_t
input_row_size
=
in_shape
[
2
]
*
in_shape
[
3
];
const
index_t
patch_row_size
=
filter_w
*
in_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
...
mace/ops/conv_pool_2d_util.cc
浏览文件 @
69583cd6
...
...
@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const
index_t
in_batch_size
=
channels
*
in_image_size
;
const
index_t
out_batch_size
=
channels
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
batch
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channels
;
++
j
)
{
for
(
int
k
=
0
;
k
<
height
;
++
k
)
{
...
...
@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
if
(
padding_same_value
)
{
LOG
(
FATAL
)
<<
"Not implemented"
;
}
else
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
...
...
mace/ops/deconv_2d.cc
浏览文件 @
69583cd6
...
...
@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const
index_t
batch
=
out_shape
[
0
];
const
index_t
channels
=
out_shape
[
1
];
const
index_t
img_size
=
out_shape
[
2
]
*
out_shape
[
3
];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
img_size
;
++
i
)
{
...
...
@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
in_channels
=
in_shape
[
1
];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
oc
=
0
;
oc
<
out_channels
;
++
oc
)
{
float
*
out_base
=
...
...
mace/ops/depth_to_space.cc
浏览文件 @
69583cd6
...
...
@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
const
T
*
input_ptr
=
input
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
output_height
;
++
h
)
{
...
...
mace/ops/eltwise.cc
浏览文件 @
69583cd6
...
...
@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break
;
case
PROD
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
*
input1
[
i
];
...
...
@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break
;
case
MIN
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break
;
case
MAX
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break
;
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break
;
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
case
EQUAL
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
...
...
@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
}
...
...
@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
[
i
]
*
coeff_copy
[
1
];
}
...
...
@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
-
input0
[
i
];
}
}
break
;
case
PROD
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
[
i
];
}
...
...
@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
/
input0
[
i
];
}
}
break
;
case
MIN
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
MAX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
);
}
...
...
@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
break
;
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
[
i
]);
}
...
...
@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
}
break
;
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
case
EQUAL
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
==
input1
[
i
];
}
...
...
@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
;
}
...
...
@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
*
coeff_copy
[
1
];
}
...
...
@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
;
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
-
input0
[
i
];
}
}
break
;
case
PROD
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
;
}
...
...
@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
;
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
/
input0
[
i
];
}
}
break
;
case
MIN
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
);
}
break
;
case
MAX
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
);
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
,
2.
f
);
}
...
...
@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
break
;
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
);
}
...
...
@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
}
break
;
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
case
EQUAL
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
==
input1
;
}
...
...
@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break
;
case
PROD
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break
;
case
MIN
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break
;
case
MAX
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break
;
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break
;
case
NEG
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
case
EQUAL
:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
T
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
...
...
@@ -991,7 +991,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
index_t
handled_output_size
=
0
;
#ifdef MACE_ENABLE_NEON
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
handled_output_size
;
i
<=
output
->
size
()
-
8
;
i
+=
8
)
{
const
auto
input0_val
=
vld1_u8
(
input0_ptr
+
i
);
const
auto
input1_val
=
vld1_u8
(
input1_ptr
+
i
);
...
...
@@ -1037,7 +1037,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
}
handled_output_size
=
output
->
size
()
-
output
->
size
()
%
8
;
#endif // NEON
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
i
=
handled_output_size
;
i
<
output
->
size
();
++
i
)
{
const
int32_t
offset_input0
=
input0_ptr
[
i
]
-
input0
->
zero_point
();
const
int32_t
offset_input1
=
input1_ptr
[
i
]
-
input1
->
zero_point
();
...
...
mace/ops/gather.cc
浏览文件 @
69583cd6
...
...
@@ -62,7 +62,7 @@ class GatherOp : public Operation {
params
->
shape
().
end
(),
1
,
std
::
multiplies
<
index_t
>
());
index_t
index_size
=
indices
->
size
();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
l
=
0
;
l
<
lhs_size
;
++
l
)
{
for
(
index_t
idx
=
0
;
idx
<
index_size
;
++
idx
)
{
MACE_ASSERT
(
indices_data
[
idx
]
<
axis_dim_size
,
"idx out of bound: "
,
...
...
mace/ops/local_response_norm.cc
浏览文件 @
69583cd6
...
...
@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
index_t
image_size
=
height
*
width
;
index_t
batch_size
=
channels
*
image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
int
begin_input_c
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
...
...
mace/ops/pooling.cc
浏览文件 @
69583cd6
...
...
@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
...
...
@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
];
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
...
...
@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const
int
*
stride_hw
,
const
int
*
pad_hw
,
uint8_t
*
output
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
...
@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const
int
*
stride_hw
,
const
int
*
pad_hw
,
uint8_t
*
output
)
{
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
1
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
2
];
++
w
)
{
...
...
mace/ops/reduce_mean.cc
浏览文件 @
69583cd6
...
...
@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
output_ptr
[
0
]
=
sum
/
data_reshape_
[
0
];
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
output_ptr
[
i
]
=
input_ptr
[
i
];
}
...
...
@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
case
2
:
if
(
reduce_first_axis_
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
0
];
++
j
)
{
output_ptr
[
i
]
+=
input_ptr
[
j
*
data_reshape_
[
1
]
+
i
];
...
...
@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr
[
i
]
/=
data_reshape_
[
0
];
}
}
else
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
1
];
++
j
)
{
output_ptr
[
i
]
+=
input_ptr
[
i
*
data_reshape_
[
1
]
+
j
];
...
...
@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
case
3
:
if
(
reduce_first_axis_
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
0
];
++
k
)
{
...
...
@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr
[
i
]
/=
(
data_reshape_
[
0
]
*
data_reshape_
[
2
]);
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
...
...
@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break
;
case
4
:
if
(
reduce_first_axis_
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
1
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
3
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
2
];
++
k
)
{
...
...
@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
}
}
else
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
int
i
=
0
;
i
<
data_reshape_
[
0
];
++
i
)
{
for
(
int
j
=
0
;
j
<
data_reshape_
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
data_reshape_
[
1
];
++
k
)
{
...
...
mace/ops/resize_bicubic.cc
浏览文件 @
69583cd6
...
...
@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
const
float
height_scale
,
const
float
width_scale
,
float
*
output
)
{
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
std
::
vector
<
float
>
y_weights
;
...
...
mace/ops/resize_bilinear.cc
浏览文件 @
69583cd6
...
...
@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
T
*
output
)
{
const
CachedInterpolation
*
xs
=
xs_vec
.
data
();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
T
...
...
@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
const
T
*
input_base
=
images
+
b
*
channels
*
in_height
*
in_width
;
T
*
output_base
=
output
+
b
*
channels
*
out_height
*
out_width
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
y
=
0
;
y
<
out_height
;
++
y
)
{
const
T
*
y_lower_input_ptr
=
input_base
+
ys
[
y
].
lower
*
in_width
*
channels
;
...
...
mace/ops/sgemm.cc
浏览文件 @
69583cd6
...
...
@@ -252,7 +252,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
}
if
(
batch
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_RUN_PER_BATCH
}
else
{
MACE_SGEMM_RUN_PER_BATCH
...
...
@@ -279,7 +279,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
// as possible to cache, by tiling lhs by height and rhs by width.
// w: 4
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
bw
=
0
;
bw
<
block_w
;
++
bw
)
{
index_t
remain_h
=
height
;
index_t
block_h
=
0
;
...
...
@@ -702,7 +702,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
rhs_data
+=
(
width
-
remain_w
)
*
depth
;
// w: 1
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
bw
=
0
;
bw
<
remain_w
;
++
bw
)
{
index_t
remain_h
=
height
;
...
...
@@ -923,7 +923,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
PackPerBatch(src, order, b, packed_data + b * height * width); \
}
if
(
src
.
batch
()
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_PACK_PER_BATCH
}
else
{
MACE_SGEMM_PACK_PER_BATCH
...
...
@@ -945,7 +945,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
}
if
(
matrix_map
->
batch
()
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
MACE_SGEMM_UNPACK_PER_BATCH
}
else
{
MACE_SGEMM_UNPACK_PER_BATCH
...
...
@@ -968,7 +968,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
...
@@ -989,7 +989,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h
+=
(
height
-
h
)
/
8
*
8
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
...
@@ -1005,7 +1005,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h
+=
(
height
-
h
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
std
::
copy_n
(
src_data
+
ih
*
width
,
width
,
packed_data
+
ih
*
width
);
}
...
...
@@ -1015,7 +1015,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
...
@@ -1030,7 +1030,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h
+=
(
height
-
h
)
/
8
*
8
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<=
height
-
4
;
ih
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
...
@@ -1043,7 +1043,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h
+=
(
height
-
h
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
ih
=
h
;
ih
<
height
;
++
ih
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
float
*
packed_data_ptr
=
packed_data
+
ih
*
width
;
...
...
@@ -1056,7 +1056,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing no-transpose rhs.
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
...
@@ -1069,7 +1069,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
...
@@ -1082,7 +1082,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing transpose-needed rhs.
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
...
@@ -1098,7 +1098,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
src_data
+
iw
*
height
,
height
,
packed_data
+
iw
*
height
);
}
...
...
@@ -1118,7 +1118,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for non-transposed result
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
...
...
@@ -1131,7 +1131,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
...
...
@@ -1143,7 +1143,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for transposed result
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
*
height
;
...
...
@@ -1159,7 +1159,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
packed_data
+
iw
*
height
,
height
,
unpacked_data
+
iw
*
height
);
...
...
mace/ops/softmax.cc
浏览文件 @
69583cd6
...
...
@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
const
index_t
batch_size
=
class_count
*
class_size
;
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
b
*
batch_size
+
k
;
float
*
output_ptr
=
output_data
+
b
*
batch_size
+
k
;
...
...
@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
}
else
if
(
input
->
dim_size
()
==
2
)
{
// normal 2d softmax
const
index_t
class_size
=
input
->
dim
(
0
);
const
index_t
class_count
=
input
->
dim
(
1
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
k
*
class_count
;
float
*
output_ptr
=
output_data
+
k
*
class_count
;
...
...
@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
// If depth is short, do it using float32. Float computation should not
// be here, but as long as it is on CPU, it is fine.
if
(
depth
<
32
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
...
...
@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
(
1ll
<<
31
)
-
1.0
));
int32_t
input_delta_limit
=
-
((
1ll
<<
31
)
-
1
)
/
scale_q
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
const
uint8_t
*
input_ptr
=
input_data
+
b
*
depth
;
uint8_t
*
output_ptr
=
output_data
+
b
*
depth
;
...
...
mace/ops/space_to_batch.cc
浏览文件 @
69583cd6
...
...
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
// Make channel the outer loop so we can make the best use of the cache.
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3)
schedule(runtime)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
block_h
+=
block_h_size
)
{
...
...
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
index_t
out_width
=
batch_tensor
->
dim
(
2
);
index_t
channels
=
batch_tensor
->
dim
(
3
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
const
index_t
in_b
=
b
%
in_batches
;
const
index_t
tile_index
=
b
/
in_batches
;
...
...
mace/ops/sqrdiff_mean.cc
浏览文件 @
69583cd6
...
...
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
const
index_t
img_size
=
input0
->
dim
(
2
)
*
input0
->
dim
(
3
);
const
index_t
bc
=
input0
->
dim
(
0
)
*
input0
->
dim
(
1
);
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
bc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
img_size
;
++
j
)
{
T
diff
=
input_ptr0
[
i
*
img_size
+
j
]
-
input_ptr1
[
i
];
...
...
mace/public/mace.h
浏览文件 @
69583cd6
...
...
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
PRIORITY_HIGH
=
3
};
// Policy that selects how many CPU threads are initiated and which cores
// they are given affinity to.
//
// For the policies driven by 'num_threads_hint' (AFFINITY_NONE,
// AFFINITY_HIGH_PERFORMANCE and AFFINITY_POWER_SAVE): if the hint is -1 or
// greater than the number of available cores, it is reset to the number of
// available cores.
enum CPUAffinityPolicy {
  // Initiate 'num_threads_hint' threads with no affinity scheduled.
  AFFINITY_NONE = 0,

  // Use all available big cores; the number of threads equals the number of
  // available big cores.
  AFFINITY_BIG_ONLY = 1,

  // Use all available little cores; the number of threads equals the number
  // of available little cores.
  AFFINITY_LITTLE_ONLY = 2,

  // Initiate 'num_threads_hint' threads on different cores, choosing the
  // cores with the top-num_threads_hint frequencies.
  AFFINITY_HIGH_PERFORMANCE = 3,

  // Initiate 'num_threads_hint' threads on different cores, choosing the
  // cores with the bottom-num_threads_hint frequencies.
  AFFINITY_POWER_SAVE = 4,
};
struct
CallStats
{
...
...
mace/utils/quantize.h
浏览文件 @
69583cd6
...
...
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
int32_t
zero_point
,
T
*
output
)
{
float
recip_scale
=
1
/
scale
;
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
Saturate
<
T
>
(
roundf
(
zero_point
+
recip_scale
*
input
[
i
]));
}
...
...
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
const
float
scale
,
const
int32_t
zero_point
,
float
*
output
)
{
#pragma omp parallel for
#pragma omp parallel for
schedule(runtime)
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
scale
*
(
input
[
i
]
-
zero_point
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录