Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
cd506756
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
cd506756
编写于
6月 06, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Gemm transpose
上级
c0c0dfe5
变更
14
展开全部
隐藏空白更改
内联
并排
Showing
14 changed file
with
685 addition
and
439 deletion
+685
-439
mace/kernels/gemm.cc
mace/kernels/gemm.cc
+386
-289
mace/kernels/gemm.h
mace/kernels/gemm.h
+6
-2
mace/kernels/gemm_test.cc
mace/kernels/gemm_test.cc
+40
-43
mace/kernels/matmul.h
mace/kernels/matmul.h
+40
-14
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+5
-1
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+16
-8
mace/kernels/opencl/image_to_buffer.cc
mace/kernels/opencl/image_to_buffer.cc
+5
-1
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+18
-9
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+2
-2
mace/ops/matmul.h
mace/ops/matmul.h
+25
-12
mace/ops/matmul_benchmark.cc
mace/ops/matmul_benchmark.cc
+58
-2
mace/ops/matmul_test.cc
mace/ops/matmul_test.cc
+81
-53
mace/python/tools/converter_tool/transformer.py
mace/python/tools/converter_tool/transformer.py
+2
-2
mace/python/tools/memory_optimizer.py
mace/python/tools/memory_optimizer.py
+1
-1
未找到文件。
mace/kernels/gemm.cc
浏览文件 @
cd506756
此差异已折叠。
点击以展开。
mace/kernels/gemm.h
浏览文件 @
cd506756
...
...
@@ -30,7 +30,9 @@ void Gemm(const float *A,
const
index_t
height
,
const
index_t
K
,
const
index_t
width
,
float
*
C
);
float
*
C
,
const
bool
transpose_a
=
false
,
const
bool
transpose_b
=
false
);
void
GemmRef
(
const
float
*
A
,
const
float
*
B
,
...
...
@@ -38,7 +40,9 @@ void GemmRef(const float *A,
const
index_t
height
,
const
index_t
K
,
const
index_t
width
,
float
*
C
);
float
*
C
,
const
bool
transpose_a
=
false
,
const
bool
transpose_b
=
false
);
void
Gemv
(
const
float
*
m_ptr
,
const
float
*
v_ptr
,
...
...
mace/kernels/gemm_test.cc
浏览文件 @
cd506756
...
...
@@ -13,17 +13,22 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include <memory>
#include <random>
#include "mace/kernels/gemm.h"
#include "mace/core/types.h"
#include "mace/kernels/gemm.h"
namespace
mace
{
namespace
{
void
GemmTest
(
index_t
batch
,
index_t
N
,
index_t
K
,
index_t
M
)
{
void
GemmTest
(
index_t
batch
,
index_t
N
,
index_t
K
,
index_t
M
,
bool
transpose_a
,
bool
transpose_b
)
{
std
::
unique_ptr
<
float
[]
>
A
(
new
float
[
batch
*
N
*
K
]);
std
::
unique_ptr
<
float
[]
>
B
(
new
float
[
batch
*
K
*
M
]);
std
::
unique_ptr
<
float
[]
>
C
(
new
float
[
batch
*
N
*
M
]);
...
...
@@ -34,15 +39,13 @@ void GemmTest(index_t batch, index_t N, index_t K, index_t M) {
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
A
.
get
(),
A
.
get
()
+
batch
*
N
*
K
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
.
get
(),
B
.
get
()
+
batch
*
K
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
}
);
kernels
::
Gemm
(
A
.
get
(),
B
.
get
(),
batch
,
N
,
K
,
M
,
C
.
get
());
kernels
::
GemmRef
(
A
.
get
(),
B
.
get
(),
batch
,
N
,
K
,
M
,
C_ref
.
get
()
);
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
kernels
::
Gemm
(
A
.
get
(),
B
.
get
(),
batch
,
N
,
K
,
M
,
C
.
get
(),
transpose_a
,
transpose_b
);
kernels
::
Gemm
Ref
(
A
.
get
(),
B
.
get
(),
batch
,
N
,
K
,
M
,
C_ref
.
get
(),
transpose_a
,
transpose_b
);
for
(
int
i
=
0
;
i
<
batch
*
N
*
M
;
++
i
)
{
EXPECT_NEAR
(
C_ref
[
i
],
C
[
i
],
0.1
);
...
...
@@ -59,14 +62,8 @@ void GemvTest(index_t batch, index_t N, index_t M) {
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
A
.
get
(),
A
.
get
()
+
N
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
.
get
(),
B
.
get
()
+
batch
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
A
.
get
(),
A
.
get
()
+
N
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
.
get
(),
B
.
get
()
+
batch
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
kernels
::
Gemv
(
A
.
get
(),
B
.
get
(),
batch
,
M
,
N
,
C
.
get
());
kernels
::
GemvRef
(
A
.
get
(),
B
.
get
(),
batch
,
M
,
N
,
C_ref
.
get
());
...
...
@@ -78,36 +75,36 @@ void GemvTest(index_t batch, index_t N, index_t M) {
}
// namespace
TEST
(
GEMMTest
,
AlignedWithoutBatch
)
{
GemmTest
(
1
,
1
,
64
,
128
);
GemmTest
(
1
,
2
,
64
,
128
);
GemmTest
(
1
,
3
,
64
,
128
);
GemmTest
(
1
,
4
,
64
,
128
);
GemmTest
(
1
,
5
,
64
,
128
);
GemmTest
(
1
,
6
,
64
,
128
);
GemmTest
(
1
,
7
,
64
,
128
);
GemmTest
(
1
,
17
,
64
,
128
);
GemmTest
(
1
,
1
,
64
,
128
,
false
,
false
);
GemmTest
(
1
,
2
,
64
,
128
,
false
,
true
);
GemmTest
(
1
,
3
,
64
,
128
,
true
,
false
);
GemmTest
(
1
,
4
,
64
,
128
,
true
,
true
);
GemmTest
(
1
,
5
,
64
,
128
,
false
,
false
);
GemmTest
(
1
,
6
,
64
,
128
,
false
,
true
);
GemmTest
(
1
,
7
,
64
,
128
,
true
,
false
);
GemmTest
(
1
,
17
,
64
,
128
,
true
,
true
);
}
TEST
(
GEMMTest
,
UnalignedWithoutBatch
)
{
GemmTest
(
1
,
1
,
63
,
127
);
GemmTest
(
1
,
2
,
63
,
127
);
GemmTest
(
1
,
3
,
63
,
127
);
GemmTest
(
1
,
4
,
63
,
127
);
GemmTest
(
1
,
5
,
63
,
127
);
GemmTest
(
1
,
6
,
63
,
127
);
GemmTest
(
1
,
7
,
63
,
127
);
GemmTest
(
1
,
17
,
63
,
127
);
GemmTest
(
1
,
1
,
63
,
127
,
false
,
false
);
GemmTest
(
1
,
2
,
63
,
127
,
false
,
true
);
GemmTest
(
1
,
3
,
63
,
127
,
true
,
false
);
GemmTest
(
1
,
4
,
63
,
127
,
true
,
true
);
GemmTest
(
1
,
5
,
63
,
127
,
false
,
false
);
GemmTest
(
1
,
6
,
63
,
127
,
false
,
true
);
GemmTest
(
1
,
7
,
63
,
127
,
true
,
false
);
GemmTest
(
1
,
17
,
63
,
127
,
true
,
true
);
}
TEST
(
GEMMTest
,
UnalignedWithBatch
)
{
GemmTest
(
3
,
1
,
63
,
127
);
GemmTest
(
3
,
2
,
63
,
127
);
GemmTest
(
3
,
3
,
63
,
127
);
GemmTest
(
3
,
4
,
63
,
127
);
GemmTest
(
3
,
5
,
63
,
127
);
GemmTest
(
3
,
6
,
63
,
127
);
GemmTest
(
3
,
7
,
63
,
127
);
GemmTest
(
3
,
17
,
63
,
127
);
GemmTest
(
3
,
1
,
63
,
127
,
false
,
false
);
GemmTest
(
3
,
2
,
63
,
127
,
false
,
true
);
GemmTest
(
3
,
3
,
63
,
127
,
true
,
false
);
GemmTest
(
3
,
4
,
63
,
127
,
true
,
true
);
GemmTest
(
3
,
5
,
63
,
127
,
false
,
false
);
GemmTest
(
3
,
6
,
63
,
127
,
false
,
true
);
GemmTest
(
3
,
7
,
63
,
127
,
true
,
false
);
GemmTest
(
3
,
17
,
63
,
127
,
true
,
true
);
}
TEST
(
GEMMTest
,
gemv
)
{
...
...
mace/kernels/matmul.h
浏览文件 @
cd506756
...
...
@@ -20,6 +20,8 @@
#endif
#include <algorithm>
#include <utility>
#include <functional>
#include <memory>
#include <string>
#include <vector>
...
...
@@ -36,14 +38,39 @@
namespace
mace
{
namespace
kernels
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
MatMulFunctor
{
MaceStatus
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
Tensor
*
C
,
StatsFuture
*
future
)
{
const
Tensor
*
B
,
Tensor
*
C
,
bool
transpose_a
,
bool
transpose_b
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
index_t
batch
;
index_t
height
;
index_t
K
;
index_t
width
;
index_t
rank
=
A
->
dim_size
();
height
=
A
->
dim
(
rank
-
2
);
K
=
A
->
dim
(
rank
-
1
);
if
(
transpose_a
)
{
std
::
swap
(
height
,
K
);
}
if
(
transpose_b
)
{
width
=
B
->
dim
(
rank
-
2
);
}
else
{
width
=
B
->
dim
(
rank
-
1
);
}
batch
=
std
::
accumulate
(
A
->
shape
().
begin
(),
A
->
shape
().
end
()
-
2
,
1
,
std
::
multiplies
<
index_t
>
());
std
::
vector
<
index_t
>
c_shape
=
A
->
shape
();
c_shape
[
rank
-
2
]
=
height
;
c_shape
[
rank
-
1
]
=
width
;
MACE_RETURN_IF_ERROR
(
C
->
Resize
(
c_shape
));
Tensor
::
MappingGuard
guarda
(
A
);
...
...
@@ -53,28 +80,27 @@ struct MatMulFunctor {
const
T
*
b_ptr_base
=
B
->
data
<
T
>
();
T
*
c_ptr_base
=
C
->
mutable_data
<
T
>
();
const
index_t
batch
=
C
->
dim
(
0
);
const
index_t
height
=
C
->
dim
(
1
);
const
index_t
width
=
C
->
dim
(
2
);
const
index_t
K
=
A
->
dim
(
2
);
// It is better to use large block size if it fits for fast cache.
// Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
// the block size should be sqrt(32k / sizeof(T) / 3).
memset
(
c_ptr_base
,
0
,
batch
*
height
*
width
*
sizeof
(
T
));
Gemm
(
a_ptr_base
,
b_ptr_base
,
batch
,
height
,
K
,
width
,
c_ptr_base
);
Gemm
(
a_ptr_base
,
b_ptr_base
,
batch
,
height
,
K
,
width
,
c_ptr_base
,
transpose_a
,
transpose_b
);
return
MACE_SUCCESS
;
}
};
#ifdef MACE_ENABLE_OPENCL
template
<
typename
T
>
template
<
typename
T
>
struct
MatMulFunctor
<
DeviceType
::
GPU
,
T
>
{
MaceStatus
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
Tensor
*
C
,
StatsFuture
*
future
);
const
Tensor
*
B
,
Tensor
*
C
,
bool
transpose_a
,
bool
transpose_b
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
cd506756
...
...
@@ -134,7 +134,11 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
}
else
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
1
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
2
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
if
(
buffer
->
dim_size
()
<
4
)
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
1
));
}
else
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
}
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
image
->
opencl_image
()));
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
cd506756
...
...
@@ -76,19 +76,27 @@ void CalWinogradFilterImageShape(
// [W * C, N * RoundUp<4>(H)]
void
CalInOutHeightImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* NHWC */
std
::
vector
<
size_t
>
*
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
std
::
vector
<
index_t
>
padded_shape
=
shape
;
while
(
padded_shape
.
size
()
<
4
)
{
padded_shape
.
push_back
(
1
);
}
MACE_CHECK
(
padded_shape
.
size
()
==
4
);
image_shape
->
resize
(
2
);
(
*
image_shape
)[
0
]
=
shape
[
2
]
*
shape
[
3
];
(
*
image_shape
)[
1
]
=
shape
[
0
]
*
RoundUpDiv4
(
shape
[
1
]);
(
*
image_shape
)[
0
]
=
padded_shape
[
2
]
*
padded_
shape
[
3
];
(
*
image_shape
)[
1
]
=
padded_shape
[
0
]
*
RoundUpDiv4
(
padded_
shape
[
1
]);
}
// [RoundUp<4>(W) * C, N * H]
void
CalInOutWidthImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* NHWC */
std
::
vector
<
size_t
>
*
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
std
::
vector
<
index_t
>
padded_shape
=
shape
;
while
(
padded_shape
.
size
()
<
4
)
{
padded_shape
.
push_back
(
1
);
}
MACE_CHECK
(
padded_shape
.
size
()
==
4
);
image_shape
->
resize
(
2
);
(
*
image_shape
)[
0
]
=
RoundUpDiv4
(
shape
[
2
])
*
shape
[
3
];
(
*
image_shape
)[
1
]
=
shape
[
0
]
*
shape
[
1
];
(
*
image_shape
)[
0
]
=
RoundUpDiv4
(
padded_shape
[
2
])
*
padded_
shape
[
3
];
(
*
image_shape
)[
1
]
=
padded_shape
[
0
]
*
padded_
shape
[
1
];
}
// [Ic * H * W, (Oc + 3) / 4]
...
...
@@ -150,10 +158,10 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
std
::
vector
<
index_t
>
CalWinogradShape
(
const
std
::
vector
<
index_t
>
&
shape
,
const
BufferType
type
)
{
if
(
type
==
WINOGRAD_FILTER
)
{
return
{
16
,
shape
[
0
],
shape
[
1
]
,
1
};
return
{
16
,
shape
[
0
],
shape
[
1
]};
}
else
if
(
type
==
IN_OUT_HEIGHT
)
{
index_t
out_width
=
shape
[
0
]
*
((
shape
[
1
]
-
1
)
/
2
)
*
((
shape
[
2
]
-
1
)
/
2
);
return
{
16
,
shape
[
3
],
out_width
,
1
};
return
{
16
,
shape
[
3
],
out_width
};
}
else
{
LOG
(
FATAL
)
<<
"Mace not supported yet."
;
return
std
::
vector
<
index_t
>
();
...
...
mace/kernels/opencl/image_to_buffer.cc
浏览文件 @
cd506756
...
...
@@ -122,7 +122,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
}
else
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
1
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
2
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
if
(
buffer
->
dim_size
()
<
4
)
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
1
));
}
else
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
}
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
image
->
opencl_image
()));
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
cd506756
...
...
@@ -24,17 +24,27 @@ template <typename T>
MaceStatus
MatMulFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
Tensor
*
C
,
bool
transpose_a
,
bool
transpose_b
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
MACE_CHECK
(
!
transpose_a
&&
!
transpose_b
,
"GPU does not support transpose matmul"
);
index_t
rank
=
A
->
dim_size
();
index_t
height
=
A
->
dim
(
rank
-
2
);
index_t
K
=
A
->
dim
(
rank
-
1
);
index_t
width
=
B
->
dim
(
rank
-
1
);
index_t
batch
=
std
::
accumulate
(
A
->
shape
().
begin
(),
A
->
shape
().
end
()
-
2
,
1
,
std
::
multiplies
<
index_t
>
());
std
::
vector
<
index_t
>
c_shape
=
A
->
shape
();
c_shape
[
rank
-
2
]
=
height
;
c_shape
[
rank
-
1
]
=
width
;
std
::
vector
<
size_t
>
c_image_shape
;
CalImage2DShape
(
c_shape
,
BufferType
::
IN_OUT_HEIGHT
,
&
c_image_shape
);
MACE_RETURN_IF_ERROR
(
C
->
ResizeImage
(
c_shape
,
c_image_shape
));
const
index_t
batch
=
C
->
dim
(
0
);
const
index_t
height
=
C
->
dim
(
1
);
const
index_t
width
=
C
->
dim
(
2
);
const
index_t
height_blocks
=
RoundUpDiv4
(
height
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
uint32_t
gws
[
2
]
=
{
...
...
@@ -82,13 +92,12 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
*
(
C
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
A
->
dim
(
2
)
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
K
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
)
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
K
)));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
64
,
64
,
0
};
std
::
string
tuning_key
=
Concat
(
"matmul_opencl_kernel"
,
C
->
dim
(
0
),
C
->
dim
(
1
),
C
->
dim
(
2
),
C
->
dim
(
3
));
std
::
string
tuning_key
=
Concat
(
"matmul_opencl_kernel"
,
batch
,
height
,
width
);
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
cd506756
...
...
@@ -74,7 +74,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
&
image_shape
);
MACE_RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
...
...
@@ -104,7 +104,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
)
,
output_tensor
->
dim
(
3
)
);
output_tensor
->
dim
(
2
));
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/ops/matmul.h
浏览文件 @
cd506756
...
...
@@ -25,24 +25,37 @@ template <DeviceType D, class T>
class
MatMulOp
:
public
Operator
<
D
,
T
>
{
public:
MatMulOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{}
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
transpose_a_
(
OperatorBase
::
GetOptionalArg
<
bool
>
(
"transpose_a"
,
false
)),
transpose_b_
(
OperatorBase
::
GetOptionalArg
<
bool
>
(
"transpose_b"
,
false
))
{
}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
A
=
this
->
Input
(
0
);
const
Tensor
*
B
=
this
->
Input
(
1
);
Tensor
*
C
=
this
->
Output
(
0
);
MACE_CHECK
(
A
->
dim_size
()
==
4
&&
4
==
B
->
dim_size
())
<<
"The dimension of A and B should be 4"
;
MACE_CHECK
(
A
->
dim
(
0
)
==
B
->
dim
(
0
))
<<
"A and B must have same batch size"
;
MACE_CHECK
(
A
->
dim
(
2
)
==
B
->
dim
(
1
))
<<
"the number of A's column "
<<
A
->
dim
(
2
)
<<
" must be equal to B's row "
<<
B
->
dim
(
1
);
return
functor_
(
A
,
B
,
C
,
future
);
const
Tensor
*
A
=
this
->
Input
(
INPUT_A
);
const
Tensor
*
B
=
this
->
Input
(
INPUT_B
);
Tensor
*
C
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
A
->
dim_size
()
==
B
->
dim_size
()
&&
A
->
dim_size
()
>=
2
,
"rank(A) should be equal to rank(B), rank should be greater "
"than or equal to 2"
);
index_t
rank
=
A
->
dim_size
();
for
(
index_t
i
=
0
;
i
<
rank
-
2
;
++
i
)
{
MACE_CHECK
(
A
->
dim
(
i
)
==
B
->
dim
(
i
),
"batch dimensions are not equal"
);
}
index_t
ak
=
transpose_a_
?
A
->
dim
(
rank
-
2
)
:
A
->
dim
(
rank
-
1
);
index_t
bk
=
transpose_b_
?
B
->
dim
(
rank
-
1
)
:
B
->
dim
(
rank
-
2
);
MACE_CHECK
(
ak
==
bk
,
"the number of A's column "
,
ak
,
" must be equal to B's row "
,
bk
);
return
functor_
(
A
,
B
,
C
,
transpose_a_
,
transpose_b_
,
future
);
}
private:
MACE_OP_INPUT_TAGS
(
INPUT_A
,
INPUT_B
);
MACE_OP_OUTPUT_TAGS
(
OUTPUT
);
kernels
::
MatMulFunctor
<
D
,
T
>
functor_
;
bool
transpose_a_
;
bool
transpose_b_
;
};
}
// namespace ops
...
...
mace/ops/matmul_benchmark.cc
浏览文件 @
cd506756
...
...
@@ -31,8 +31,8 @@ void MatMulBenchmark(
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
});
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
...
...
@@ -65,6 +65,41 @@ void MatMulBenchmark(
}
net
.
Sync
();
}
template
<
DeviceType
D
,
typename
T
>
void
MatMulTransposeBenchmark
(
int
iters
,
int
batch
,
int
height
,
int
channels
,
int
out_width
)
{
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
out_width
,
channels
});
if
(
D
==
DeviceType
::
CPU
)
{
OpDefBuilder
(
"MatMul"
,
"MatMulBM"
)
.
Input
(
"A"
)
.
Input
(
"B"
)
.
AddIntArg
(
"transpose_b"
,
1
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
}
else
{
MACE_NOT_IMPLEMENTED
;
}
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
RunOp
(
D
);
}
net
.
Sync
();
}
}
// namespace
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
...
...
@@ -83,6 +118,20 @@ void MatMulBenchmark(
MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
MACE_BENCHMARK(MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
MACE_BM_MATMUL
(
16
,
32
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
3969
);
...
...
@@ -90,6 +139,13 @@ MACE_BM_MATMUL(16, 128, 128, 49);
MACE_BM_MATMUL
(
16
,
128
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
3969
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
32
,
128
,
49
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
32
,
128
,
961
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
32
,
128
,
3969
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
128
,
128
,
49
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
128
,
128
,
961
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
128
,
128
,
3969
);
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/matmul_test.cc
浏览文件 @
cd506756
...
...
@@ -72,46 +72,46 @@ void Simple(const std::vector<index_t> &A_shape,
}
// namespace
TEST_F
(
MatMulOpTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
{
1
,
5
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
}
TEST_F
(
MatMulOpTest
,
SimpleCPUWithBatch
)
{
Simple
<
DeviceType
::
CPU
>
({
2
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
,
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
({
2
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
3
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
2
,
2
},
{
22
,
28
,
49
,
64
,
22
,
28
,
49
,
64
});
}
TEST_F
(
MatMulOpTest
,
SimpleOPENCL
)
{
Simple
<
DeviceType
::
GPU
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
GPU
>
({
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
GPU
>
(
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
{
1
,
5
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
}
TEST_F
(
MatMulOpTest
,
SimpleGPUWithBatch
)
{
Simple
<
DeviceType
::
CPU
>
({
2
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
,
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
({
2
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
3
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
2
,
2
},
{
22
,
28
,
49
,
64
,
22
,
28
,
49
,
64
});
}
namespace
{
template
<
typename
T
>
void
Complex
(
const
index_t
batch
,
void
Complex
(
const
std
::
vector
<
index_t
>
&
batch
,
const
index_t
height
,
const
index_t
channels
,
const
index_t
out_width
)
{
...
...
@@ -119,23 +119,14 @@ void Complex(const index_t batch,
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"MatMul"
,
"MatMulTest"
)
.
Input
(
"A"
)
.
Input
(
"B"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
,
1
});
// run cpu
net
.
RunOp
();
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
index_t
batch_count
=
std
::
accumulate
(
batch
.
begin
(),
batch
.
end
(),
1
,
std
::
multiplies
<
index_t
>
());
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"A"
,
{
batch_count
,
height
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"B"
,
{
batch_count
,
channels
,
out_width
});
// Run on opencl
BufferToImage
<
DeviceType
::
GPU
,
T
>
(
&
net
,
"A"
,
"AImage"
,
...
...
@@ -150,11 +141,40 @@ void Complex(const index_t batch,
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run on opencl
net
.
RunOp
(
DeviceType
::
GPU
);
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
// run cpu
std
::
vector
<
index_t
>
shape_a
=
batch
;
shape_a
.
push_back
(
height
);
shape_a
.
push_back
(
channels
);
std
::
vector
<
index_t
>
shape_b
=
batch
;
shape_b
.
push_back
(
channels
);
shape_b
.
push_back
(
out_width
);
std
::
vector
<
index_t
>
expected_output_shape
=
batch
;
expected_output_shape
.
push_back
(
height
);
expected_output_shape
.
push_back
(
out_width
);
net
.
GetTensor
(
"A"
)
->
Reshape
(
shape_a
);
net
.
GetTensor
(
"B"
)
->
Reshape
(
shape_b
);
OpDefBuilder
(
"MatMul"
,
"MatMulTest"
)
.
Input
(
"A"
)
.
Input
(
"B"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
();
// Check
EXPECT_EQ
(
expected_output_shape
,
net
.
GetOutput
(
"Output"
)
->
shape
());
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
expected
.
Reshape
({
batch_count
,
height
,
out_width
});
if
(
DataTypeToEnum
<
T
>::
value
==
DataType
::
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-1
);
...
...
@@ -166,28 +186,36 @@ void Complex(const index_t batch,
}
// namespace
TEST_F
(
MatMulOpTest
,
OPENCLAlignedWithoutBatch
)
{
Complex
<
float
>
(
1
,
64
,
128
,
32
);
Complex
<
float
>
(
1
,
64
,
32
,
128
);
Complex
<
float
>
({
1
},
64
,
128
,
32
);
Complex
<
float
>
({
1
},
64
,
32
,
128
);
Complex
<
float
>
({
2
,
3
},
64
,
32
,
128
);
}
TEST_F
(
MatMulOpTest
,
OPENCLUnAlignedWithoutBatch
)
{
Complex
<
float
>
(
1
,
31
,
113
,
61
);
Complex
<
float
>
(
1
,
113
,
31
,
73
);
Complex
<
float
>
({
1
},
31
,
113
,
61
);
Complex
<
float
>
({
1
},
113
,
31
,
73
);
Complex
<
float
>
({
2
,
3
},
113
,
31
,
73
);
}
TEST_F
(
MatMulOpTest
,
OPENCLUnAlignedWithBatch
)
{
Complex
<
float
>
(
2
,
3
,
3
,
3
);
Complex
<
float
>
(
16
,
31
,
61
,
67
);
Complex
<
float
>
(
31
,
31
,
61
,
67
);
Complex
<
float
>
({
2
},
3
,
3
,
3
);
Complex
<
float
>
({
16
},
31
,
61
,
67
);
Complex
<
float
>
({
31
},
31
,
61
,
67
);
Complex
<
float
>
({
2
,
3
},
31
,
61
,
67
);
}
TEST_F
(
MatMulOpTest
,
OPENCLHalfAlignedWithoutBatch
)
{
Complex
<
half
>
(
1
,
64
,
128
,
32
);
Complex
<
half
>
(
1
,
64
,
32
,
128
);
Complex
<
half
>
({
1
},
64
,
128
,
32
);
Complex
<
half
>
({
1
},
64
,
32
,
128
);
Complex
<
half
>
({
2
,
3
},
64
,
32
,
128
);
}
TEST_F
(
MatMulOpTest
,
OPENCLHalfUnAlignedWithBatch
)
{
Complex
<
half
>
(
2
,
31
,
113
,
61
);
Complex
<
half
>
(
16
,
32
,
64
,
64
);
Complex
<
half
>
(
31
,
31
,
61
,
67
);
Complex
<
half
>
({
2
},
31
,
113
,
61
);
Complex
<
half
>
({
16
},
32
,
64
,
64
);
Complex
<
half
>
({
31
},
31
,
61
,
67
);
Complex
<
half
>
({
2
,
3
},
31
,
61
,
67
);
}
// TODO(liyin): test transpose after implementing gpu runtime
// now transpose test is in kernels_test
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/python/tools/converter_tool/transformer.py
浏览文件 @
cd506756
...
...
@@ -518,7 +518,7 @@ class Transformer(base_converter.ConverterInterface):
wt_output_width
=
batch
*
(
(
out_height
+
1
)
/
2
)
*
((
out_width
+
1
)
/
2
)
wt_output_shape
.
dims
.
extend
(
[
16
,
in_channels
,
wt_output_width
,
1
])
[
16
,
in_channels
,
wt_output_width
])
if
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_padding_str
)
\
...
...
@@ -543,7 +543,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op
.
output
.
extend
([
matmul_op
.
name
])
matmul_output_shape
=
matmul_op
.
output_shape
.
add
()
matmul_output_shape
.
dims
.
extend
(
[
16
,
out_channels
,
wt_output_width
,
1
])
[
16
,
out_channels
,
wt_output_width
])
arg
=
matmul_op
.
arg
.
add
()
arg
.
name
=
MaceKeyword
.
mace_winograd_filter_transformed
...
...
mace/python/tools/memory_optimizer.py
浏览文件 @
cd506756
...
...
@@ -167,7 +167,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
def
get_op_mem_block
(
self
,
op_type
,
output_shape
):
mem_block
=
[
0
,
0
]
if
op_type
==
'WinogradTransform'
or
op_type
==
'MatMul'
:
mem_block
[
0
]
=
output_shape
[
2
]
*
output_shape
[
3
]
mem_block
[
0
]
=
output_shape
[
2
]
mem_block
[
1
]
=
output_shape
[
0
]
*
int
((
output_shape
[
1
]
+
3
)
/
4
)
else
:
mem_block
[
0
]
=
output_shape
[
2
]
*
int
((
output_shape
[
3
]
+
3
)
/
4
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录