Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Mr.Vain
Mace
提交
46baf92a
Mace
项目概览
Mr.Vain
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
46baf92a
编写于
6月 05, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor eltwise
上级
7bdc8a4d
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
603 addition
and
340 deletion
+603
-340
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+477
-303
mace/kernels/softmax.h
mace/kernels/softmax.h
+54
-23
mace/ops/eltwise.h
mace/ops/eltwise.h
+3
-1
mace/ops/eltwise_test.cc
mace/ops/eltwise_test.cc
+48
-7
mace/ops/softmax_test.cc
mace/ops/softmax_test.cc
+21
-6
未找到文件。
mace/kernels/eltwise.h
浏览文件 @
46baf92a
...
...
@@ -17,8 +17,8 @@
#include <algorithm>
#include <memory>
#include <vector>
#include <utility>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
...
...
@@ -44,70 +44,253 @@ enum EltwiseType {
NONE
=
10
,
};
inline
void
TensorScalar
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
value
,
const
index_t
size
,
float
*
output
)
{
inline
void
TensorBroadcastEltwise
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
std
::
vector
<
float
>
&
coeff
,
const
index_t
diff_size
,
const
index_t
common_size
,
const
bool
swapped
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
+
input1
[
i
];
}
}
}
else
{
std
::
vector
<
float
>
coeff_copy
=
coeff
;
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
*
coeff_copy
[
0
]
+
input1
[
i
]
*
coeff_copy
[
1
];
}
}
}
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
-
input1
[
i
];
}
}
}
else
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input1
[
i
]
-
input0
[
i
+
d
*
common_size
];
}
}
}
break
;
case
PROD
:
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
*
input1
[
i
];
}
}
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input0
[
i
+
d
*
common_size
]
/
input1
[
i
];
}
}
}
else
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
input1
[
i
]
/
input0
[
i
+
d
*
common_size
];
}
}
}
break
;
case
MIN
:
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
std
::
min
(
input0
[
i
+
d
*
common_size
],
input1
[
i
]);
}
}
break
;
case
MAX
:
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
std
::
max
(
input0
[
i
+
d
*
common_size
],
input1
[
i
]);
}
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
std
::
pow
(
input0
[
i
+
d
*
common_size
]
-
input1
[
i
],
2.
f
);
}
}
break
;
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
std
::
pow
(
input0
[
i
+
d
*
common_size
],
input1
[
i
]);
}
}
}
else
{
#pragma omp parallel for collapse(2)
for
(
index_t
d
=
0
;
d
<
diff_size
;
++
d
)
{
for
(
index_t
i
=
0
;
i
<
common_size
;
++
i
)
{
output
[
i
+
d
*
common_size
]
=
std
::
pow
(
input1
[
i
],
input0
[
i
+
d
*
common_size
]);
}
}
}
break
;
case
NEG
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
value
;
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
diff_size
*
common_size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
// Multiplication is costly, so we specialize the following case.
inline
void
TensorEltwise
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
std
::
vector
<
float
>
&
coeff
,
const
index_t
size
,
const
bool
swapped
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
if
(
coeff
.
empty
())
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
}
}
else
{
std
::
vector
<
float
>
coeff_copy
=
coeff
;
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
[
i
]
*
coeff_copy
[
1
];
}
}
break
;
case
SUB
:
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
value
;
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
}
}
else
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
-
input0
[
i
];
}
}
break
;
case
PROD
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
value
;
output
[
i
]
=
input0
[
i
]
*
input1
[
i
]
;
}
break
;
case
DIV
:
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
value
;
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
}
}
else
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
[
i
]
/
input0
[
i
];
}
}
break
;
case
MIN
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
<
float
>
(
input0
[
i
],
value
);
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
[
i
]
);
}
break
;
case
MAX
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
<
float
>
(
input0
[
i
],
value
);
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
[
i
]
);
}
break
;
case
NEG
:
case
SQR_DIFF
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
]
;
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
)
;
}
break
;
case
ABS
:
case
POW
:
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
abs
(
input0
[
i
]);
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
[
i
]);
}
}
else
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input1
[
i
],
input0
[
i
]);
}
}
break
;
case
SQR_DIFF
:
case
NEG
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
value
,
2.
f
)
;
output
[
i
]
=
-
input0
[
i
]
;
}
break
;
case
POW
:
case
ABS
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
value
);
output
[
i
]
=
std
::
fabs
(
input0
[
i
]
);
}
break
;
default:
...
...
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
}
}
inline
void
TensorBatchVector
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
index_t
batch
,
const
index_t
channel
,
const
index_t
hw
,
const
bool
swapped
,
float
*
output
)
{
// Multiplication is costly, so we specialize the following case.
inline
void
TensorScalarEltwise
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
input1
,
const
std
::
vector
<
float
>
&
coeff
,
const
index_t
size
,
const
bool
swapped
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
+
input1
[
idx1
];
}
if
(
coeff
.
empty
())
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
;
}
}
else
{
std
::
vector
<
float
>
coeff_copy
=
coeff
;
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
coeff_copy
[
0
]
+
input1
*
coeff_copy
[
1
];
}
}
break
;
case
SUB
:
if
(
swapped
)
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input1
[
idx1
]
-
input0
[
idx0
];
}
}
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
;
}
}
else
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
-
input1
[
idx1
];
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
-
input0
[
i
];
}
}
break
;
case
PROD
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
*
input1
[
idx1
];
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
;
}
break
;
case
DIV
:
if
(
swapped
)
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input1
[
idx1
]
/
input0
[
idx0
];
}
}
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
;
}
}
else
{
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
input0
[
idx0
]
/
input1
[
idx1
];
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input1
/
input0
[
i
];
}
}
break
;
case
MIN
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
min
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
input0
[
i
],
input1
);
}
break
;
case
MAX
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
max
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input0
[
i
],
input1
);
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
pow
(
input0
[
idx0
]
-
input1
[
idx1
],
2.
f
);
}
}
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
,
2.
f
);
}
break
;
case
POW
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
b
*
channel
+
c
;
output
[
idx0
]
=
std
::
pow
(
input0
[
idx0
],
input1
[
idx1
]);
}
if
(
!
swapped
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
);
}
}
else
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input1
,
input0
[
i
]);
}
}
break
;
case
NEG
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
];
}
break
;
case
ABS
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
inline
void
TensorVector
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
index_t
batch
,
const
index_t
channel
,
const
index_t
hw
,
const
bool
swapped
,
float
*
output
)
{
inline
void
TensorEltwisePerChannel
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
std
::
vector
<
float
>
&
coeff
,
const
index_t
batch0
,
const
index_t
batch1
,
const
index_t
channel
,
const
index_t
image_size
,
const
bool
swapped
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input0
[
idx0
]
+
input1
[
idx1
];
if
(
coeff
.
empty
())
{
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in0_ptr
[
i
]
+
in1_ptr
[
c
];
}
}
}
}
else
{
std
::
vector
<
float
>
coeff_copy
=
coeff
;
if
(
swapped
)
{
std
::
swap
(
coeff_copy
[
0
],
coeff_copy
[
1
]);
}
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in0_ptr
[
i
]
*
coeff_copy
[
0
]
+
in1_ptr
[
c
]
*
coeff_copy
[
1
];
}
}
}
}
break
;
case
SUB
:
if
(
swapped
)
{
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input1
[
idx1
]
-
input0
[
idx0
];
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in0_ptr
[
i
]
-
in1_ptr
[
c
];
}
}
}
}
else
{
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input0
[
idx0
]
-
input1
[
idx1
];
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in1_ptr
[
c
]
-
in0_ptr
[
i
];
}
}
}
}
break
;
case
PROD
:
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input0
[
idx0
]
*
input1
[
idx1
];
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in0_ptr
[
i
]
*
in1_ptr
[
c
];
}
}
}
break
;
case
DIV
:
if
(
swapped
)
{
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
if
(
!
swapped
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input1
[
idx1
]
/
input0
[
idx0
];
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in0_ptr
[
i
]
/
in1_ptr
[
c
];
}
}
}
}
else
{
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
input0
[
idx0
]
/
input1
[
idx1
];
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
in1_ptr
[
c
]
/
in0_ptr
[
i
];
}
}
}
}
break
;
case
MIN
:
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
std
::
min
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
std
::
min
(
in0_ptr
[
i
],
in1_ptr
[
c
]);
}
}
}
break
;
case
MAX
:
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
std
::
max
<
float
>
(
input0
[
idx0
],
input1
[
idx1
]);
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
std
::
max
(
in0_ptr
[
i
],
in1_ptr
[
c
]);
}
}
}
break
;
case
SQR_DIFF
:
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for collapse(
2
)
for
(
index_t
b
=
0
;
b
<
batch
0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
std
::
pow
(
input0
[
idx0
]
-
input1
[
idx1
],
2.
f
);
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
std
::
pow
(
in0_ptr
[
i
]
-
in1_ptr
[
c
],
2.
f
);
}
}
}
break
;
case
POW
:
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
hw
;
++
i
)
{
const
index_t
idx0
=
(
b
*
channel
+
c
)
*
hw
+
i
;
const
index_t
idx1
=
c
;
output
[
idx0
]
=
std
::
pow
(
input0
[
idx0
],
input1
[
idx1
]);
if
(
!
swapped
)
{
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
std
::
pow
(
in0_ptr
[
i
],
in1_ptr
[
c
]);
}
}
}
}
else
{
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch0
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
const
float
*
in0_ptr
=
input0
+
((
b
*
channel
)
+
c
)
*
image_size
;
const
float
*
in1_ptr
=
input1
+
(
batch1
>
1
?
b
*
channel
:
0
);
float
*
out_ptr
=
output
+
((
b
*
channel
)
+
c
)
*
image_size
;
for
(
index_t
i
=
0
;
i
<
image_size
;
++
i
)
{
out_ptr
[
i
]
=
std
::
pow
(
in1_ptr
[
c
],
in0_ptr
[
i
]);
}
}
}
}
break
;
default:
LOG
(
FATAL
)
<<
"Eltwise op not support type "
<<
type
;
}
}
inline
void
TensorEltwise
(
const
EltwiseType
type
,
const
float
*
input0
,
const
float
*
input1
,
const
index_t
size
,
float
*
output
)
{
switch
(
type
)
{
case
SUM
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
+
input1
[
i
];
}
break
;
case
SUB
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
-
input1
[
i
];
}
break
;
case
PROD
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
*
input1
[
i
];
}
break
;
case
DIV
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
input0
[
i
]
/
input1
[
i
];
}
break
;
case
MIN
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
<
float
>
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
MAX
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
<
float
>
(
input0
[
i
],
input1
[
i
]);
}
break
;
case
SQR_DIFF
:
case
NEG
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
]
-
input1
[
i
],
2.
f
)
;
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_
size
;
++
i
)
{
output
[
i
]
=
-
input0
[
i
]
;
}
break
;
case
POW
:
case
ABS
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
pow
(
input0
[
i
],
input1
[
i
]);
for
(
index_t
i
=
0
;
i
<
batch0
*
channel
*
image_
size
;
++
i
)
{
output
[
i
]
=
std
::
fabs
(
input0
[
i
]);
}
break
;
default:
...
...
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
}
}
struct
EltwiseFunctorBase
{
EltwiseFunctorBase
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
type_
(
type
),
coeff_
(
coeff
),
value_
(
value
)
{}
const
float
value
,
const
DataFormat
data_format
)
:
type_
(
type
),
coeff_
(
coeff
),
value_
(
value
),
data_format_
(
data_format
)
{}
EltwiseType
type_
;
std
::
vector
<
float
>
coeff_
;
float
value_
;
DataFormat
data_format_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
EltwiseFunctor
;
template
<
>
struct
EltwiseFunctor
<
DeviceType
::
CPU
,
float
>:
EltwiseFunctorBase
{
struct
EltwiseFunctor
<
DeviceType
::
CPU
,
float
>
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
)
{}
const
float
value
,
const
DataFormat
data_format
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
,
data_format
)
{}
MaceStatus
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
if
(
input1
==
nullptr
)
{
scalar_tensor_
.
Resize
({});
Tensor
::
MappingGuard
guard
(
&
scalar_tensor_
);
auto
scalar_data
=
scalar_tensor_
.
mutable_data
<
float
>
();
scalar_data
[
0
]
=
value_
;
input1
=
&
scalar_tensor_
;
}
bool
swapped
=
false
;
if
(
input1
!=
nullptr
)
{
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
()
||
input0
->
dim_size
()
==
1
||
input1
->
dim_size
()
==
1
)
<<
"Inputs of Eltwise op must be same shape"
;
if
(
input0
->
size
()
!=
input1
->
size
())
{
if
(
input0
->
size
()
<
input1
->
size
())
{
std
::
swap
(
input0
,
input1
);
swapped
=
true
;
}
if
(
input1
->
dim_size
()
==
1
)
{
MACE_CHECK
(
input0
->
dim
(
1
)
==
input1
->
dim
(
0
))
<<
"Element-Wise op only support channel dimension broadcast"
;
}
else
{
MACE_CHECK
((
input0
->
dim
(
0
)
==
input1
->
dim
(
0
)
||
input1
->
dim
(
0
)
==
1
)
&&
input0
->
dim
(
1
)
==
input1
->
dim
(
1
)
&&
input1
->
dim
(
2
)
==
1
&&
input1
->
dim
(
3
)
==
1
)
<<
"Element-Wise op only support channel dimension broadcast"
;
if
(
input0
->
size
()
<
input1
->
size
())
{
std
::
swap
(
input0
,
input1
);
swapped
=
true
;
}
// check if we can broadcast tensor
uint32_t
rank_diff
=
static_cast
<
uint32_t
>
(
input0
->
dim_size
()
-
input1
->
dim_size
());
if
(
data_format_
==
NCHW
)
{
MACE_CHECK
(
input0
->
dim_size
()
==
4
&&
(
input1
->
dim_size
()
==
0
||
input1
->
dim_size
()
==
4
&&
input1
->
dim
(
1
)
==
input0
->
dim
(
1
)
&&
(
input1
->
dim
(
0
)
==
input0
->
dim
(
0
)
||
input1
->
dim
(
0
)
==
1
)
||
input1
->
dim_size
()
==
1
&&
input1
->
dim
(
0
)
==
input0
->
dim
(
1
)),
"only support broadcast channel dimension"
);
}
else
{
if
(
rank_diff
>
0
&&
rank_diff
<
input0
->
dim_size
())
{
for
(
uint32_t
i
=
0
;
i
<
input1
->
dim_size
();
++
i
)
{
MACE_CHECK
(
input0
->
dim
(
rank_diff
+
i
)
==
input1
->
dim
(
i
),
"Element-Wise op only support tail dimensions broadcast"
);
}
}
}
index_t
common_size
=
input1
->
size
();
index_t
diff_size
=
input0
->
size
()
/
common_size
;
MACE_RETURN_IF_ERROR
(
output
->
ResizeLike
(
input0
));
Tensor
::
MappingGuard
input0_guard
(
input0
);
Tensor
::
MappingGuard
input1_guard
(
input1
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
float
*
input0_ptr
=
input0
->
data
<
float
>
();
const
float
*
input1_ptr
=
input1
->
data
<
float
>
();
float
*
output_ptr
=
output
->
mutable_data
<
float
>
();
const
index_t
size
=
input0
->
size
();
if
(
input1
==
nullptr
)
{
TensorScalar
(
type_
,
input0_ptr
,
value_
,
size
,
output_ptr
);
if
(
data_format_
==
NCHW
&&
input1
->
dim_size
()
>
0
&&
input1
->
size
()
<
input0
->
size
())
{
TensorEltwisePerChannel
(
type_
,
input0_ptr
,
input1_ptr
,
coeff_
,
input0
->
dim
(
0
),
input1
->
dim_size
()
==
1
?
1
:
input1
->
dim
(
0
),
input0
->
dim
(
1
),
input0
->
dim
(
2
)
*
input0
->
dim
(
3
),
swapped
,
output_ptr
);
}
else
{
Tensor
::
MappingGuard
input1_guard
(
input1
);
const
float
*
input1_ptr
=
input1
->
data
<
float
>
();
if
(
input1
->
size
()
!=
input0
->
size
())
{
const
index_t
batch
=
input0
->
dim
(
0
);
const
index_t
channel
=
input0
->
dim
(
1
);
const
index_t
hw
=
input0
->
dim
(
2
)
*
input0
->
dim
(
3
);
if
(
input1
->
dim
(
0
)
==
1
||
input1
->
dim_size
()
==
1
)
TensorVector
(
type_
,
input0_ptr
,
input1_ptr
,
batch
,
channel
,
hw
,
swapped
,
output_ptr
);
else
TensorBatchVector
(
type_
,
input0_ptr
,
input1_ptr
,
batch
,
channel
,
hw
,
swapped
,
output_ptr
);
}
else
{
if
(
!
coeff_
.
empty
()
&&
type_
==
SUM
)
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
coeff_
[
0
]
*
input0_ptr
[
i
]
+
coeff_
[
1
]
*
input1_ptr
[
i
];
}
if
(
input1
->
size
()
==
input0
->
size
())
{
TensorEltwise
(
type_
,
input0_ptr
,
input1_ptr
,
coeff_
,
input0
->
size
(),
swapped
,
output_ptr
);
}
else
if
(
input1
->
size
()
<
input0
->
size
())
{
if
(
input1
->
size
()
>
1
)
{
TensorBroadcastEltwise
(
type_
,
input0_ptr
,
input1_ptr
,
coeff_
,
diff_size
,
common_size
,
swapped
,
output_ptr
);
}
else
{
TensorEltwise
(
type_
,
input0_ptr
,
input1_ptr
,
size
,
output_ptr
);
TensorScalarEltwise
(
type_
,
input0_ptr
,
input1_ptr
[
0
],
coeff_
,
input0
->
size
(),
swapped
,
output_ptr
);
}
}
}
return
MACE_SUCCESS
;
}
Tensor
scalar_tensor_
;
};
#ifdef MACE_ENABLE_OPENCL
...
...
@@ -540,13 +713,14 @@ template <typename T>
struct
EltwiseFunctor
<
DeviceType
::
GPU
,
T
>
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
,
const
float
value
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
)
{}
const
float
value
,
const
DataFormat
data_format
)
:
EltwiseFunctorBase
(
type
,
coeff
,
value
,
data_format
)
{}
MaceStatus
operator
()(
const
Tensor
*
input0
,
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
);
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
...
...
mace/kernels/softmax.h
浏览文件 @
46baf92a
...
...
@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
const
index_t
batch
=
input
->
dim
(
0
);
const
index_t
class_count
=
input
->
dim
(
1
);
const
index_t
class_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
const
index_t
batch_size
=
class_count
*
class_size
;
Tensor
::
MappingGuard
input_guard
(
input
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
// softmax for nchw image
if
(
input
->
dim_size
()
==
4
)
{
const
index_t
batch
=
input
->
dim
(
0
);
const
index_t
class_count
=
input
->
dim
(
1
);
const
index_t
class_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
const
index_t
batch_size
=
class_count
*
class_size
;
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
#pragma omp parallel for
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
b
*
batch_size
+
k
;
float
*
output_ptr
=
output_data
+
b
*
batch_size
+
k
;
float
max_val
=
std
::
numeric_limits
<
float
>::
lowest
();
index_t
channel_offset
=
0
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
float
data
=
input_ptr
[
channel_offset
];
if
(
data
>
max_val
)
{
max_val
=
data
;
}
channel_offset
+=
class_size
;
}
channel_offset
=
0
;
float
sum
=
0
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
float
exp_value
=
::
exp
(
input_ptr
[
channel_offset
]
-
max_val
);
sum
+=
exp_value
;
output_ptr
[
channel_offset
]
=
exp_value
;
channel_offset
+=
class_size
;
}
sum
=
std
::
max
(
sum
,
std
::
numeric_limits
<
float
>::
min
());
channel_offset
=
0
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
output_ptr
[
channel_offset
]
/=
sum
;
channel_offset
+=
class_size
;
}
}
// k
}
// b
}
else
if
(
input
->
dim_size
()
==
2
)
{
// normal 2d softmax
const
index_t
class_size
=
input
->
dim
(
0
);
const
index_t
class_count
=
input
->
dim
(
1
);
#pragma omp parallel for
for
(
index_t
k
=
0
;
k
<
class_size
;
++
k
)
{
const
float
*
input_ptr
=
input_data
+
b
*
batch_size
+
k
;
float
*
output_ptr
=
output_data
+
b
*
batch_size
+
k
;
const
float
*
input_ptr
=
input_data
+
k
*
class_count
;
float
*
output_ptr
=
output_data
+
k
*
class_count
;
float
max_val
=
std
::
numeric_limits
<
float
>::
lowest
();
index_t
channel_offset
=
0
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
float
data
=
input_ptr
[
channel_offset
];
if
(
data
>
max_val
)
{
max_val
=
data
;
}
channel_offset
+=
class_size
;
max_val
=
std
::
max
(
max_val
,
input_ptr
[
c
]);
}
channel_offset
=
0
;
float
sum
=
0
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
float
exp_value
=
::
exp
(
input_ptr
[
c
hannel_offset
]
-
max_val
);
float
exp_value
=
::
exp
(
input_ptr
[
c
]
-
max_val
);
sum
+=
exp_value
;
output_ptr
[
channel_offset
]
=
exp_value
;
channel_offset
+=
class_size
;
output_ptr
[
c
]
=
exp_value
;
}
channel_offset
=
0
;
sum
=
std
::
max
(
sum
,
std
::
numeric_limits
<
float
>::
min
())
;
for
(
index_t
c
=
0
;
c
<
class_count
;
++
c
)
{
output_ptr
[
channel_offset
]
/=
sum
;
channel_offset
+=
class_size
;
output_ptr
[
c
]
/=
sum
;
}
}
// k
}
// b
}
}
else
{
MACE_NOT_IMPLEMENTED
;
}
return
MACE_SUCCESS
;
}
...
...
mace/ops/eltwise.h
浏览文件 @
46baf92a
...
...
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
static_cast
<
kernels
::
EltwiseType
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"type"
,
static_cast
<
int
>
(
kernels
::
EltwiseType
::
NONE
))),
OperatorBase
::
GetRepeatedArgs
<
float
>
(
"coeff"
),
OperatorBase
::
GetOptionalArg
<
float
>
(
"value"
,
1.0
))
{}
OperatorBase
::
GetOptionalArg
<
float
>
(
"value"
,
1.0
),
static_cast
<
DataFormat
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"data_format"
,
0
)))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input0
=
this
->
Input
(
0
);
...
...
mace/ops/eltwise_test.cc
浏览文件 @
46baf92a
...
...
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.
Input
(
"TInput"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
.
AddFloatArg
(
"value"
,
x
)
.
AddIntArg
(
"data_format"
,
DataFormat
::
NCHW
)
.
Output
(
"TOutput"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
...
...
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net
.
AddInputFromArray
<
D
,
float
>
(
"Input1"
,
shape1
,
input1
);
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"TInput0"
)
.
Input
(
"TInput1"
)
auto
op_builder
=
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
.
AddFloatsArg
(
"coeff"
,
coeff
)
.
Output
(
"TOutput"
)
.
Finalize
(
net
.
NewOperatorDef
());
.
AddIntArg
(
"data_format"
,
DataFormat
::
NCHW
)
.
Output
(
"TOutput"
);
if
(
shape0
.
size
()
>
1
)
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
op_builder
.
Input
(
"TInput0"
);
}
else
{
op_builder
.
Input
(
"Input0"
);
}
if
(
shape1
.
size
()
>
1
)
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
op_builder
.
Input
(
"TInput1"
);
}
else
{
op_builder
.
Input
(
"Input1"
);
}
op_builder
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
...
...
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
3
},
{
1
,
2
,
3
},
{
2
,
4
,
6
,
5
,
7
,
9
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
,
5
,
5
,
5
,
5
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
-
5
,
-
5
,
-
5
,
-
5
,
-
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
3
},
{
1
,
2
,
3
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
4
,
10
,
18
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
5
},
{
1
,
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
1
,
6
,
7
,
8
,
9
,
2
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
},
{
1
,
1
,
1
,
2
,
4
},
{
1
,
2
,
1
,
5
},
{
1
,
1
,
1
,
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
2
,
1
,
1
,
1
,
2
,
4
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
}
TEST_F
(
EltwiseOpTest
,
GPUSimpleTensorVector
)
{
...
...
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.
Input
(
"TInput"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
.
AddFloatArg
(
"value"
,
0.1
)
.
AddIntArg
(
"data_format"
,
DataFormat
::
NCHW
)
.
Output
(
"TOutput"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
...
...
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.
Input
(
"TInput1"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
.
AddFloatsArg
(
"coeff"
,
coeff
)
.
AddIntArg
(
"data_format"
,
DataFormat
::
NCHW
)
.
Output
(
"TOutput"
)
.
Finalize
(
net
.
NewOperatorDef
());
...
...
mace/ops/softmax_test.cc
浏览文件 @
46baf92a
...
...
@@ -29,8 +29,12 @@ void Simple() {
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
1
,
2
,
4
},
{
1
,
1
,
1
,
1
,
1
,
2
,
3
,
4
});
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
1
,
2
,
4
},
{
0.25
,
0.25
,
0.25
,
0.25
,
0.0320586
,
0.08714432
,
0.23688282
,
0.64391426
});
if
(
D
==
DeviceType
::
CPU
)
{
// test 4d softmax
net
.
TransformDataFormat
<
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Softmax"
,
"SoftmaxTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -40,6 +44,21 @@ void Simple() {
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
// check 2d softmax
net
.
AddInputFromArray
<
D
,
float
>
(
"Input2d"
,
{
2
,
4
},
{
1
,
1
,
1
,
1
,
1
,
2
,
3
,
4
});
OpDefBuilder
(
"Softmax"
,
"SoftmaxTest"
)
.
Input
(
"Input2d"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
GetOutput
(
"Output"
)
->
Reshape
({
1
,
1
,
2
,
4
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -55,15 +74,11 @@ void Simple() {
// Transfer output
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
1
,
2
,
4
},
{
0.25
,
0.25
,
0.25
,
0.25
,
0.0320586
,
0.08714432
,
0.23688282
,
0.64391426
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
}
// namespace
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录