Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
fdb477e2
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fdb477e2
编写于
4月 27, 2018
作者:
刘
刘托
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'refactor-eltwise' into 'master'
Refactor eltwise op. See merge request !428
上级
d5cbd799
25d2ad2d
变更
8
展开全部
隐藏空白更改
内联
并排
Showing
8 changed file
with
969 addition
and
677 deletion
+969
-677
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+302
-189
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+49
-71
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+34
-16
mace/ops/eltwise.h
mace/ops/eltwise.h
+7
-44
mace/ops/eltwise_benchmark.cc
mace/ops/eltwise_benchmark.cc
+15
-11
mace/ops/eltwise_test.cc
mace/ops/eltwise_test.cc
+529
-327
mace/python/tools/caffe_converter_lib.py
mace/python/tools/caffe_converter_lib.py
+8
-2
mace/python/tools/tf_converter_lib.py
mace/python/tools/tf_converter_lib.py
+25
-17
未找到文件。
mace/kernels/eltwise.h
浏览文件 @
fdb477e2
...
...
@@ -18,6 +18,7 @@
#include <algorithm>
#include <memory>
#include <vector>
#include <utility>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
...
...
@@ -30,216 +31,331 @@ namespace mace {
namespace
kernels
{
enum
EltwiseType
{
PROD
=
0
,
SU
M
=
1
,
MAX
=
2
,
MIN
=
3
,
SUB
=
4
,
DIV
=
5
,
SUM
=
0
,
SU
B
=
1
,
PROD
=
2
,
DIV
=
3
,
MIN
=
4
,
MAX
=
5
,
NEG
=
6
,
ABS
=
7
,
SQR_DIFF
=
8
,
NONE
=
9
,
};
struct
EltwiseFunctorBase
{
EltwiseFunctorBase
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
type_
(
type
),
coeff_
(
coeff
)
{}
EltwiseType
type_
;
std
::
vector
<
float
>
coeff_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
EltwiseFunctor
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
EltwiseFunctorBase
(
type
,
coeff
)
{}
void
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
index_t
start_axis
,
const
bool
is_scaler
,
const
float
value
,
const
bool
swap
,
Tensor
*
output
,
StatsFuture
*
future
)
{
if
(
is_scaler
)
{
Tensor
::
MappingGuard
input0_guard
(
input0
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
T
*
input0_ptr
=
input0
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
const
index_t
num
=
input0
->
size
();
switch
(
type_
)
{
case
PROD
:
inline
void
TensorScalar
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
value
,
const
index_t
size
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
input0_ptr
[
i
]
*
value
;
}
break
;
case
SUM
:
if
(
coeff_
.
empty
())
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
value
;
}
break
;
case
SUB
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
input0_ptr
[
i
]
+
value
;
}
}
else
{
const
float
coeff_0
=
swap
?
coeff_
[
1
]
:
coeff_
[
0
];
const
float
coeff_1
=
swap
?
coeff_
[
0
]
:
coeff_
[
1
];
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
value
;
}
break
;
case
PROD
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
coeff_0
*
input0_ptr
[
i
]
+
coeff_1
*
value
;
}
}
break
;
case
MAX
:
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
value
;
}
break
;
case
DIV
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
std
::
max
<
T
>
(
input0_ptr
[
i
],
value
)
;
}
break
;
case
MIN
:
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
value
;
}
break
;
case
MIN
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
std
::
min
<
T
>
(
input0_ptr
[
i
],
value
);
}
break
;
case
SUB
:
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
<
float
>
(
input0
[
i
],
value
);
}
break
;
case
MAX
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
swap
?
value
-
input0_ptr
[
i
]
:
input0_ptr
[
i
]
-
value
;
}
break
;
case
DIV
:
if
(
!
swap
)
{
MACE_CHECK
(
fabs
(
value
)
>
1e-6
,
"cannot divided by 0."
);
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
<
float
>
(
input0
[
i
],
value
);
}
break
;
case
NEG
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
output_ptr
[
i
]
=
input0_ptr
[
i
]
/
value
;
}
}
else
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
MACE_CHECK
(
fabs
(
input0_ptr
[
i
])
>
1e-6
,
"cannot divided by 0."
);
output_ptr
[
i
]
=
value
/
input0_ptr
[
i
];
}
}
break
;
case
SQR_DIFF
:
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
abs
(
input0
[
i
]);
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
const
float
tmp
=
input0_ptr
[
i
]
-
value
;
output_ptr
[
i
]
=
tmp
*
tmp
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type_
;
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
value
,
2.
f
);
}
}
else
{
MACE_CHECK_NOTNULL
(
input0
);
MACE_CHECK_NOTNULL
(
input1
);
Tensor
::
MappingGuard
input0_guard
(
input0
);
Tensor
::
MappingGuard
input1_guard
(
input1
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
T
*
input0_ptr
=
input0
->
data
<
T
>
();
const
T
*
input1_ptr
=
input1
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
const
index_t
size0
=
input0
->
size
();
const
index_t
size1
=
input1
->
size
();
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
const
index_t
num
=
size0
/
size1
;
switch
(
type_
)
{
case
PROD
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
input0_ptr
[
i
*
size1
+
j
]
*
input1_ptr
[
j
];
}
inline
void
TensorVector
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
index_t
batch
,
const
index_t
channel
,
const
index_t
hw
,
const
bool
swapped
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
+
input1
[
idx1
];
}
break
;
case
SUM
:
if
(
coeff_
.
empty
())
{
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
input0_ptr
[
i
*
size1
+
j
]
+
input1_ptr
[
j
];
}
}
}
else
{
const
float
coeff_0
=
swap
?
coeff_
[
1
]
:
coeff_
[
0
];
const
float
coeff_1
=
swap
?
coeff_
[
0
]
:
coeff_
[
1
];
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
coeff_0
*
input0_ptr
[
i
*
size1
+
j
]
+
coeff_1
*
input1_ptr
[
j
];
}
}
}
break
;
case
SUB
:
if
(
swapped
)
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input1
[
idx1
]
-
input0
[
idx0
];
}
}
break
;
case
MAX
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
std
::
max
<
T
>
(
input0_ptr
[
i
*
size1
+
j
],
input1_ptr
[
j
]);
}
}
else
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
-
input1
[
idx1
];
}
}
break
;
case
MIN
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
std
::
min
<
T
>
(
input0_ptr
[
i
*
size1
+
j
],
input1_ptr
[
j
]);
}
}
}
break
;
case
PROD
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
*
input1
[
idx1
];
}
break
;
case
SUB
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
output_ptr
[
i
*
size1
+
j
]
=
swap
?
input0_ptr
[
i
*
size1
+
j
]
-
input1_ptr
[
j
]
:
input1_ptr
[
j
]
-
input0_ptr
[
i
*
size1
+
j
];
}
}
break
;
case
DIV
:
if
(
swapped
)
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input1
[
idx1
]
/
input0
[
idx0
];
}
}
break
;
case
DIV
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
if
(
!
swap
)
{
MACE_CHECK
(
fabs
(
input1_ptr
[
j
])
>
1e-6
,
"cannot divided by 0."
);
output_ptr
[
i
*
size1
+
j
]
=
input0_ptr
[
i
*
size1
+
j
]
/
input1_ptr
[
j
];
}
else
{
MACE_CHECK
(
fabs
(
input0_ptr
[
i
*
size1
+
j
])
>
1e-6
,
"cannot divided by 0."
);
output_ptr
[
i
*
size1
+
j
]
=
input1_ptr
[
j
]
/
input0_ptr
[
i
*
size1
+
j
];
}
}
}
else
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
/
input1
[
idx1
];
}
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
size1
;
++
j
)
{
const
T
tmp
=
input0_ptr
[
i
*
size1
+
j
]
-
input1_ptr
[
j
];
output_ptr
[
i
*
size1
+
j
]
=
tmp
*
tmp
;
}
}
}
break
;
case
MIN
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
min
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
}
}
}
break
;
case
MAX
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
max
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
}
}
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
pow
(
input0
[
idx0
]
-
input1
[
idx1
],
2.
f
);
}
}
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
inline
void
TensorEltwise
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
index_t
size
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
}
break
;
case
SUB
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
}
break
;
case
PROD
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
[
i
];
}
break
;
case
DIV
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
}
break
;
case
MIN
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
<
float
>
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
MAX
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
<
float
>
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
);
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
struct
EltwiseFunctorBase
{
EltwiseFunctorBase
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
type_
(
type
),
coeff_
(
coeff
),
value_
(
value
)
{}
EltwiseType
type_
;
std
::
vector
<
float
>
coeff_
;
float
value_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
EltwiseFunctor
;
template
<
>
struct
EltwiseFunctor
<
DeviceType
::
CPU
,
float
>:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
)
{}
void
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
)
{
bool
swapped
=
false
;
if
(
input1
!=
nullptr
)
{
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
())
<<
"Inputs of Eltwise op must be same shape"
;
if
(
input0
->
size
()
!=
input1
->
size
())
{
if
(
input0
->
size
()
<
input1
->
size
())
{
std
::
swap
(
input0
,
input1
);
swapped
=
true
;
}
MACE_CHECK
(
input0
->
dim
(
0
)
==
input1
->
dim
(
0
)
&&
input0
->
dim
(
1
)
==
input1
->
dim
(
1
)
&&
input1
->
dim
(
2
)
==
1
&&
input1
->
dim
(
3
)
==
1
)
<<
"Element-Wise op only support channel dimension broadcast"
;
}
}
output
->
ResizeLike
(
input0
);
Tensor
::
MappingGuard
input0_guard
(
input0
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
float
*
input0_ptr
=
input0
->
data
<
float
>
();
float
*
output_ptr
=
output
->
mutable_data
<
float
>
();
const
index_t
size
=
input0
->
size
();
if
(
input1
==
nullptr
)
{
TensorScalar
(
type_
,
input0_ptr
,
value_
,
size
,
output_ptr
);
}
else
{
Tensor
::
MappingGuard
input1_guard
(
input1
);
const
float
*
input1_ptr
=
input1
->
data
<
float
>
();
if
(
input1
->
size
()
!=
input0
->
size
())
{
const
index_t
batch
=
input0
->
dim
(
0
);
const
index_t
channel
=
input0
->
dim
(
1
);
const
index_t
hw
=
input0
->
dim
(
2
)
*
input0
->
dim
(
3
);
TensorVector
(
type_
,
input0_ptr
,
input1_ptr
,
batch
,
channel
,
hw
,
swapped
,
output_ptr
);
}
else
{
if
(
!
coeff_
.
empty
()
&&
type_
==
SUM
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
coeff_
[
0
]
*
input0_ptr
[
i
]
+
coeff_
[
1
]
*
input1_ptr
[
i
];
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type_
;
}
else
{
TensorEltwise
(
type_
,
input0_ptr
,
input1_ptr
,
size
,
output_ptr
);
}
}
}
}
...
...
@@ -249,15 +365,12 @@ struct EltwiseFunctor : EltwiseFunctorBase {
template
<
typename
T
>
struct
EltwiseFunctor
<
DeviceType
::
OPENCL
,
T
>
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
EltwiseFunctorBase
(
type
,
coeff
)
{}
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
)
{}
void
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
index_t
start_axis
,
const
bool
is_scaler
,
const
float
value
,
const
bool
swap
,
Tensor
*
output
,
StatsFuture
*
future
);
...
...
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
fdb477e2
...
...
@@ -3,8 +3,11 @@
__kernel
void
eltwise
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
#
if
INPUT_TYPE
==
1
__private
const
float
value,
#
else
__read_only
image2d_t
input1,
#
endif
__private
const
int
height,
__private
const
int
width,
__private
const
int
channel,
...
...
@@ -13,101 +16,76 @@ __kernel void eltwise(KERNEL_ERROR_PARAMS
__private
const
float
coeff1,
#
endif
__write_only
image2d_t
output
)
{
const
int
c
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
c
han_idx
=
get_global_id
(
0
)
;
const
int
w
idth_idx
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
c
>=
global_size_dim0
|
| w >= global_size_dim1 || hb >= global_size_dim2)
if
(
chan_idx
>=
global_size_dim0
|
|
width_idx >= global_size_dim1 || hb >= global_size_dim2)
return;
#endif
int pos_w;
int pos_h;
#if START_AXIS == 0
pos_w = mad24(c, width, w);
pos_h = hb;
#elif START_AXIS == 1
pos_w = mad24(c, width, w);
pos_h = hb % height;
#elif START_AXIS == 2
pos_w = mad24(c, width, w);
pos_h = 0;
#elif START_AXIS == 3
pos_w = c;
pos_h = 0;
#endif
const int pos = mad24(c, width, w);
const int remain_channel = channel - 4 * c;
const int pos = mad24(chan_idx, width, width_idx);
DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 in1 ;
#if IS_SCALER == 1
in1 = (DATA_TYPE4){value, value, value, value};
#if INPUT_TYPE == 1
DATA_TYPE4 in1 = (DATA_TYPE4)(value, value, value, value);
#elif INPUT_TYPE == 2
const int batch_idx = hb / height;
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, batch_idx));
#else
in1 = READ_IMAGET(input1, SAMPLER, (int2)(pos_w, pos_h
));
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(pos, hb
));
#endif
DATA_TYPE4 out;
#if ELTWISE_TYPE == 0
out = in0 * in1;
#ifdef COEFF_SUM
out = mad(coeff1, in0, mad(coeff0, in1, 0));
#else
out = in0 + in1;
#endif
#elif ELTWISE_TYPE == 1
#ifdef COEFF_SUM
#if NEEDSWAP == 0
out = mad(coeff0, in0, mad(coeff1, in1, 0));
#ifdef SWAPPED
out = in1 - in0;
#else
out =
mad(coeff1, in0, mad(coeff0, in1, 0))
;
out =
in0 - in1
;
#endif
#else
out = in0 + in1;
#endif
#elif ELTWISE_TYPE == 2
out =
fmax(in0, in1)
;
out =
in0 * in1
;
#elif ELTWISE_TYPE == 3
out = fmin(in0, in1);
#elif ELTWISE_TYPE == 4
#if NEED_SWAP == 0
out = in0 - in1;
#ifdef SWAPPED
out = in1 / in0;
#else
out = in
1 - in0
;
out = in
0 / in1
;
#endif
#elif ELTWISE_TYPE == 4
out = fmin(in0, in1);
#elif ELTWISE_TYPE == 5
#if NEED_SWAP == 0
if (fabs(in1.x) > 0.000001f)
out.x = in0.x / in1.x;
if (fabs(in1.y) > 0.000001f)
out.y = in0.y / in1.y;
if (fabs(in1.z) > 0.000001f)
out.z = in0.z / in1.z;
if (fabs(in1.w) > 0.000001f)
out.w = in0.w / in1.w;
#else
if (fabs(in1.x) > 0.000001f)
out.x = in1.x / in0.x;
if (fabs(in1.y) > 0.000001f)
out.y = in1.y / in0.y;
if (fabs(in1.z) > 0.000001f)
out.z = in1.z / in0.z;
if (fabs(in1.w) > 0.000001f)
out.w = in1.w / in0.w;
#endif
out = fmax(in0, in1);
#elif ELTWISE_TYPE == 6
in1 = (DATA_TYPE4)(0, 0, 0, 0);
out = in1 - in0;
#elif ELTWISE_TYPE == 7
out = fabs(in0);
#elif ELTWISE_TYPE == 8
DATA_TYPE4 diff = in0 - in1;
out = diff * diff;
#endif
#if ELTWISE_TYPE == 1 || ELTWISE_TYPE == 2 || ELTWISE_TYPE == 3 \
|| ELTWISE_TYPE == 4 |
|
ELTWISE_TYPE
==
8
if
(
remain_channel
<
4
)
{
switch
(
remain_channel
)
{
case
1:
out.y
=
0
;
case
2:
out.z
=
0
;
case
3:
out.w
=
0
;
#if INPUT_TYPE == 1
#if ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || ELTWISE_TYPE == 5 |
|
ELTWISE_TYPE
==
8
const
int
remain_channel
=
channel
-
4
*
chan_idx
;
if
(
remain_channel
<
4
)
{
switch
(
remain_channel
)
{
case
1:
out.y
=
0
;
case
2:
out.z
=
0
;
case
3:
out.w
=
0
;
}
}
}
#
endif
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
fdb477e2
...
...
@@ -23,16 +23,29 @@ namespace kernels {
template
<
typename
T
>
void
EltwiseFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
index_t
start_axis
,
const
bool
is_scaler
,
const
float
value
,
const
bool
swap
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
index_t
batch
=
input0
->
dim
(
0
);
const
index_t
height
=
input0
->
dim
(
1
);
const
index_t
width
=
input0
->
dim
(
2
);
const
index_t
channels
=
input0
->
dim
(
3
);
bool
swapped
=
false
;
if
(
input1
!=
nullptr
)
{
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
())
<<
"Inputs of Eltwise op must be same shape"
;
if
(
input0
->
size
()
!=
input1
->
size
())
{
if
(
input0
->
size
()
<
input1
->
size
())
{
std
::
swap
(
input0
,
input1
);
swapped
=
true
;
}
MACE_CHECK
(
input0
->
dim
(
0
)
==
input1
->
dim
(
0
)
&&
input1
->
dim
(
1
)
==
1
&&
input1
->
dim
(
2
)
==
1
&&
input0
->
dim
(
3
)
==
input1
->
dim
(
3
))
<<
"Element-Wise op only support channel dimension broadcast"
;
}
}
output
->
ResizeLike
(
input0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
channels
=
output
->
dim
(
3
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
batch_height_pixels
=
batch
*
height
;
...
...
@@ -41,8 +54,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
int
scaler
=
is_scaler
?
1
:
0
;
const
int
need_swap
=
swap
?
1
:
0
;
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -52,9 +63,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
built_options
.
emplace
(
MakeString
(
"-DSTART_AXIS="
,
start_axis
));
built_options
.
emplace
(
MakeString
(
"-DIS_SCALER="
,
scaler
));
built_options
.
emplace
(
MakeString
(
"-DNEEDSWAP="
,
need_swap
));
if
(
input1
==
nullptr
)
{
built_options
.
emplace
(
"-DINPUT_TYPE=1"
);
}
else
if
(
input0
->
size
()
!=
input1
->
size
())
{
built_options
.
emplace
(
"-DINPUT_TYPE=2"
);
if
(
swapped
)
built_options
.
emplace
(
"-DSWAPPED"
);
}
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
...
...
@@ -66,7 +82,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
kwg_size_
=
...
...
@@ -84,8 +99,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
value
);
if
(
input1
==
nullptr
)
{
kernel_
.
setArg
(
idx
++
,
value_
);
}
else
{
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
}
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channels
));
...
...
mace/ops/eltwise.h
浏览文件 @
fdb477e2
...
...
@@ -28,57 +28,20 @@ class EltwiseOp : public Operator<D, T> {
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
static_cast
<
kernels
::
EltwiseType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"type"
,
static_cast
<
int
>
(
kernels
::
EltwiseType
::
SUM
))),
OperatorBase
::
GetRepeatedArgument
<
float
>
(
"coeff"
))
{}
"type"
,
static_cast
<
int
>
(
kernels
::
EltwiseType
::
NONE
))),
OperatorBase
::
GetRepeatedArgument
<
float
>
(
"coeff"
),
OperatorBase
::
GetSingleArgument
<
float
>
(
"x"
,
1.0
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
if
(
this
->
InputSize
()
==
1
)
{
const
Tensor
*
input
=
this
->
Input
(
0
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
start_axis_
=
input
->
dim_size
()
-
1
;
is_scaler_
=
true
;
output
->
ResizeLike
(
input
);
const
float
x
=
OperatorBase
::
GetSingleArgument
<
float
>
(
"x"
,
1.0
);
functor_
(
input
,
nullptr
,
start_axis_
,
is_scaler_
,
x
,
false
,
output
,
future
);
}
else
{
const
index_t
size0
=
this
->
Input
(
0
)
->
size
();
const
index_t
size1
=
this
->
Input
(
1
)
->
size
();
const
bool
swap
=
(
size0
<
size1
);
const
Tensor
*
input0
=
swap
?
this
->
Input
(
1
)
:
this
->
Input
(
0
);
const
Tensor
*
input1
=
swap
?
this
->
Input
(
0
)
:
this
->
Input
(
1
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
())
<<
"Inputs of Eltwise op must be same shape"
;
start_axis_
=
input0
->
dim_size
()
-
1
;
is_scaler_
=
(
input1
->
size
()
==
1
);
uint32_t
compared_size
=
1
;
if
(
!
is_scaler_
)
{
while
(
start_axis_
>=
0
)
{
MACE_CHECK
(
input0
->
dim
(
start_axis_
)
==
input1
->
dim
(
start_axis_
),
"Invalid inputs dimension at axis: "
)
<<
start_axis_
<<
"input 0: "
<<
input0
->
dim
(
start_axis_
)
<<
"input 1: "
<<
input1
->
dim
(
start_axis_
);
compared_size
*=
input1
->
dim
(
start_axis_
);
if
(
compared_size
==
input1
->
size
())
{
break
;
}
start_axis_
--
;
}
}
output
->
ResizeLike
(
input0
);
const
float
x
=
OperatorBase
::
GetSingleArgument
<
float
>
(
"x"
,
1.0
);
functor_
(
input0
,
input1
,
start_axis_
,
is_scaler_
,
x
,
swap
,
output
,
future
);
}
const
Tensor
*
input0
=
this
->
Input
(
0
);
const
Tensor
*
input1
=
this
->
InputSize
()
==
2
?
this
->
Input
(
1
)
:
nullptr
;
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
functor_
(
input0
,
input1
,
output
,
future
);
return
true
;
}
private:
kernels
::
EltwiseFunctor
<
D
,
T
>
functor_
;
index_t
start_axis_
;
bool
is_scaler_
;
private:
OP_OUTPUT_TAGS
(
OUTPUT
);
...
...
mace/ops/eltwise_benchmark.cc
浏览文件 @
fdb477e2
...
...
@@ -35,10 +35,10 @@ void EltwiseBenchmark(
net
.
AddRandomInput
<
D
,
T
>
(
"Input1"
,
{
n
,
h
,
w
,
c
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
,
half
>
(
&
net
,
"Input0"
,
"InputImg0"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
&
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Input0"
,
"InputImg0"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"InputImg0"
)
.
Input
(
"InputImg1"
)
...
...
@@ -48,9 +48,13 @@ void EltwiseBenchmark(
.
Output
(
"OutputImg"
)
.
Finalize
(
net
.
NewOperatorDef
());
}
else
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"Input0"
)
.
Input
(
"Input1"
)
.
Input
(
"
T
Input0"
)
.
Input
(
"
T
Input1"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
.
AddFloatsArg
(
"coeff"
,
{
1.2
,
2.1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
...
...
@@ -89,13 +93,13 @@ void EltwiseBenchmark(
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL);
BM_ELTWISE
(
0
,
1
,
256
,
256
,
32
);
BM_ELTWISE
(
0
,
1
,
128
,
128
,
32
);
BM_ELTWISE
(
1
,
1
,
128
,
128
,
32
);
BM_ELTWISE
(
2
,
1
,
128
,
128
,
32
);
BM_ELTWISE
(
0
,
1
,
240
,
240
,
256
);
BM_ELTWISE
(
1
,
1
,
240
,
240
,
256
);
BM_ELTWISE
(
2
,
1
,
240
,
240
,
256
);
BM_ELTWISE
(
2
,
1
,
256
,
256
,
32
);
BM_ELTWISE
(
0
,
1
,
128
,
128
,
32
);
BM_ELTWISE
(
0
,
1
,
240
,
240
,
256
);
BM_ELTWISE
(
5
,
1
,
128
,
128
,
32
);
BM_ELTWISE
(
5
,
1
,
240
,
240
,
256
);
}
// namespace test
}
// namespace ops
...
...
mace/ops/eltwise_test.cc
浏览文件 @
fdb477e2
此差异已折叠。
点击以展开。
mace/python/tools/caffe_converter_lib.py
浏览文件 @
fdb477e2
...
...
@@ -41,6 +41,12 @@ activation_name_map = {
'TanH'
:
'TANH'
,
}
math_type_mode
=
{
0
:
2
,
# PROD
1
:
0
,
# SUM
2
:
5
,
# MAX
}
MACE_INPUT_NODE_NAME
=
"mace_input_node"
MACE_OUTPUT_NODE_NAME
=
"mace_output_node"
...
...
@@ -921,11 +927,11 @@ class CaffeConverter(object):
param
=
op
.
layer
.
eltwise_param
type_arg
=
op_def
.
arg
.
add
()
type_arg
.
name
=
'type'
type_arg
.
i
=
param
.
operation
type_arg
.
i
=
math_type_mode
[
param
.
operation
]
if
len
(
param
.
coeff
)
>
0
:
coeff_arg
=
op_def
.
arg
.
add
()
coeff_arg
.
name
=
'coeff'
coeff_arg
.
in
ts
.
extend
(
list
(
param
.
coeff
))
coeff_arg
.
floa
ts
.
extend
(
list
(
param
.
coeff
))
output_shape
=
op
.
parents
[
0
].
output_shape_map
[
op
.
layer
.
bottom
[
0
]]
op
.
output_shape_map
[
op
.
layer
.
top
[
0
]]
=
output_shape
...
...
mace/python/tools/tf_converter_lib.py
浏览文件 @
fdb477e2
...
...
@@ -30,14 +30,14 @@ pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
# and also cwise type's in mace/kernels/cwise.h
# cuz these math ops should have compatible with "EltWise" and "CWise"
math_type_mode
=
{
'
MUL
'
:
0
,
'
ADD
'
:
1
,
'M
AX
'
:
2
,
'
MIN
'
:
3
,
'
SUB
'
:
4
,
'
DIV
'
:
5
,
'
ADD
'
:
0
,
'
SUB
'
:
1
,
'M
UL
'
:
2
,
'
DIV
'
:
3
,
'
MIN
'
:
4
,
'
MAX
'
:
5
,
'NEG'
:
6
,
'ABS'
:
7
'ABS'
:
7
,
}
buffer_type_map
=
{
...
...
@@ -859,18 +859,26 @@ class TFConverter(object):
arg
.
i
=
self
.
dt
op_def
.
name
=
op
.
name
op_def
.
type
=
"Eltwise"
op_def
.
input
.
extend
([
input
.
name
for
input
in
op
.
inputs
])
x_value
=
op
.
get_attr
(
'x'
)
if
len
(
op
.
inputs
)
>=
2
:
if
len
(
op
.
inputs
)
==
2
:
input_tensor0
=
get_input_tensor
(
op
,
0
)
input_tensor1
=
get_input_tensor
(
op
,
1
)
if
len
(
input_tensor0
)
==
1
:
x_value
=
input_tensor0
.
eval
().
astype
(
np
.
float32
)
elif
len
(
input_tensor1
)
==
1
:
x_value
=
input_tensor1
.
eval
().
astype
(
np
.
float32
)
x_arg
=
op_def
.
arg
.
add
()
x_arg
.
name
=
'x'
x_arg
.
f
=
x_value
x_value
=
None
if
np
.
asarray
(
input_tensor1
.
shape
).
size
==
0
:
x_value
=
input_tensor1
.
eval
()
op_def
.
input
.
extend
([
op
.
inputs
[
0
].
name
])
self
.
unused_tensor
.
add
(
input_tensor1
.
name
)
elif
np
.
asarray
(
input_tensor0
.
shape
).
size
==
0
:
x_value
=
input_tensor0
.
eval
()
op_def
.
input
.
extend
([
op
.
inputs
[
1
].
name
])
self
.
unused_tensor
.
add
(
input_tensor0
.
name
)
else
:
op_def
.
input
.
extend
([
input
.
name
for
input
in
op
.
inputs
])
if
x_value
is
not
None
:
x_arg
=
op_def
.
arg
.
add
()
x_arg
.
name
=
'x'
x_arg
.
f
=
x_value
else
:
op_def
.
input
.
extend
([
input
.
name
for
input
in
op
.
inputs
])
type_arg
=
op_def
.
arg
.
add
()
type_arg
.
name
=
'type'
type_arg
.
i
=
math_type_mode
[
math_type
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录