Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
1f0cbcf3
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1f0cbcf3
编写于
1月 16, 2017
作者:
X
xutianbing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add GpuMatrix::mul, CpuMatrix::mul operators
上级
936301f1
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
652 addition
and
25 deletion
+652
-25
paddle/function/BufferArg.h
paddle/function/BufferArg.h
+1
-1
paddle/function/MulOp.cpp
paddle/function/MulOp.cpp
+469
-9
paddle/function/MulOp.h
paddle/function/MulOp.h
+41
-0
paddle/function/MulOpGpu.cu
paddle/function/MulOpGpu.cu
+118
-2
paddle/function/MulOpTest.cpp
paddle/function/MulOpTest.cpp
+23
-13
未找到文件。
paddle/function/BufferArg.h
浏览文件 @
1f0cbcf3
...
...
@@ -167,7 +167,7 @@ public:
// Read-only accessors of the argument descriptor. Each one returns a stored
// member (or compares bufferType_ against a buffer-type constant) and
// changes no state.
// Returns the stored value type (valueType_).
ValueType
valueType
()
const
{
return
valueType_
;
}
// Returns the stored buffer type (bufferType_).
BufferType
bufferType
()
const
{
return
bufferType_
;
}
// Returns a const reference to the stored shape (shape_).
const
TensorShape
&
shape
()
const
{
return
shape_
;
}
// True when bufferType_ equals TENSOR_SPARSE.
// NOTE(review): this predicate and the next one have identical bodies; this
// looks like the removed and added side of a rename (isSparse ->
// isSparseArg) shown together by the diff view - confirm which one the
// class really declares.
bool
isSparse
()
const
{
return
TENSOR_SPARSE
==
bufferType_
;
}
// True when bufferType_ equals TENSOR_SPARSE (same test as above).
// NOTE(review): the name is split over two lines here ("isSparse" + "Arg");
// presumably a single identifier, isSparseArg - confirm.
bool
isSparse
Arg
()
const
{
return
TENSOR_SPARSE
==
bufferType_
;
}
// True when bufferType_ equals TENSOR_SEQUENCE_DATA.
bool
isSequenceArg
()
const
{
return
TENSOR_SEQUENCE_DATA
==
bufferType_
;
}
// Declaration only; the definition of sequence() is not part of this hunk.
const
SequenceArg
&
sequence
()
const
;
...
...
paddle/function/MulOp.cpp
浏览文件 @
1f0cbcf3
...
...
@@ -13,16 +13,471 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "MulOp.h"
#include "paddle/math/MathFunctions.h"
#include "paddle/math/SIMDFunctions.h"
#include "paddle/utils/ThreadLocal.h"
#ifndef PADDLE_TYPE_DOUBLE
#define GEMM paddle::gemm<float>
#else
#define GEMM paddle::gemm<double>
#endif
namespace {

/**
 * Element-wise accumulate: a[i] += b[i] for every i in [0, len).
 *
 * The index is a size_t so that it has the same width as `len`; a narrower
 * unsigned index would wrap around before reaching a large `len`.
 *
 * \param a    destination array, updated in place
 * \param b    source array
 * \param len  number of elements to process
 */
inline void vecAddTo(real* a, const real* b, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    a[i] += b[i];
  }
}

/**
 * Scaled element-wise accumulate: a[i] += scaleB * b[i] for i in [0, len).
 *
 * \param a       destination array, updated in place
 * \param b       source array
 * \param scaleB  factor applied to every element of b
 * \param len     number of elements to process
 */
inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    a[i] += scaleB * b[i];
  }
}

/**
 * Strided accumulate: a[i * aWidth] += b[i * bWidth] for i in [0, len).
 *
 * \param a       first destination element, updated in place
 * \param b       first source element
 * \param len     number of elements to process
 * \param aWidth  distance between consecutive destination elements
 * \param bWidth  distance between consecutive source elements
 */
inline void colVecAddTo(
    real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
  for (size_t i = 0; i < len; ++i) {
    a[i * aWidth] += b[i * bWidth];
  }
}

/**
 * Scaled strided accumulate: a[i * aWidth] += b[i * bWidth] * c
 * for i in [0, len).
 *
 * `b` is only read, so it is taken as a pointer to const like the other
 * helpers; callers that pass a mutable pointer keep working.
 *
 * \param a       first destination element, updated in place
 * \param b       first source element
 * \param c       factor applied to every source element
 * \param len     number of elements to process
 * \param aWidth  distance between consecutive destination elements
 * \param bWidth  distance between consecutive source elements
 */
inline void colVecAddTo(real* a,
                        const real* b,
                        real c,
                        size_t len,
                        size_t aWidth,
                        size_t bWidth) {
  for (size_t i = 0; i < len; ++i) {
    a[i * aWidth] += b[i * bWidth] * c;
  }
}

}  // namespace
namespace
paddle
{
template
<
>
void
MulOp
<
DEVICE_TYPE_CPU
>
(
CpuSparseMatrix
&
out
,
const
CpuMatrix
&
a
,
const
CpuMatrix
&
b
,
real
scaleAB
,
real
scaleT
)
{
/// todo(tianbing), clean the code
CHECK
(
!
out
.
isTransposed
())
<<
"Not supported"
;
CHECK_EQ
(
out
.
getValueType
(),
FLOAT_VALUE
);
const
real
*
A
=
a
.
getData
();
const
real
*
B
=
b
.
getData
();
real
*
C
=
out
.
getValue
();
int
*
rows
=
out
.
getRows
();
int
*
cols
=
out
.
getCols
();
size_t
height
=
out
.
getHeight
();
size_t
width
=
out
.
getWidth
();
if
(
scaleT
==
0
)
{
out
.
zeroMem
();
}
if
(
!
a
.
isTransposed
()
&&
!
b
.
isTransposed
())
{
size_t
m
=
a
.
getWidth
();
CHECK_EQ
(
b
.
getHeight
(),
m
);
CHECK_EQ
(
a
.
getHeight
(),
height
);
CHECK_EQ
(
b
.
getWidth
(),
width
);
if
(
out
.
getFormat
()
==
SPARSE_CSC
)
{
for
(
size_t
i
=
0
;
i
<
width
;
i
++
)
{
size_t
start
=
out
.
getColStartIdx
(
i
);
size_t
end
=
out
.
getColStartIdx
(
i
+
1
);
for
(
size_t
j
=
start
;
j
<
end
;
j
++
)
{
real
sum
=
0
;
size_t
rowIdx
=
rows
[
j
];
for
(
size_t
k
=
0
;
k
<
m
;
k
++
)
{
sum
+=
A
[
rowIdx
*
m
+
k
]
*
B
[
k
*
width
+
i
];
}
C
[
j
]
=
scaleAB
*
sum
+
scaleT
*
C
[
j
];
}
}
}
else
{
for
(
size_t
i
=
0
;
i
<
height
;
i
++
)
{
size_t
start
=
out
.
getRowStartIdx
(
i
);
size_t
end
=
out
.
getRowStartIdx
(
i
+
1
);
for
(
size_t
j
=
start
;
j
<
end
;
j
++
)
{
real
sum
=
0
;
size_t
colIdx
=
cols
[
j
];
for
(
size_t
k
=
0
;
k
<
m
;
k
++
)
{
sum
+=
A
[
i
*
m
+
k
]
*
B
[
k
*
width
+
colIdx
];
}
C
[
j
]
=
scaleAB
*
sum
+
scaleT
*
C
[
j
];
}
}
}
}
else
if
(
a
.
isTransposed
()
&&
!
b
.
isTransposed
())
{
size_t
m
=
a
.
getHeight
();
CHECK_EQ
(
m
,
b
.
getHeight
());
CHECK_EQ
(
b
.
getWidth
(),
width
);
CHECK_EQ
(
a
.
getWidth
(),
height
);
if
(
out
.
getFormat
()
==
SPARSE_CSC
)
{
for
(
size_t
i
=
0
;
i
<
width
;
i
++
)
{
size_t
start
=
out
.
getColStartIdx
(
i
);
size_t
end
=
out
.
getColStartIdx
(
i
+
1
);
for
(
size_t
j
=
start
;
j
<
end
;
j
++
)
{
real
sum
=
0
;
size_t
rowIdx
=
rows
[
j
];
for
(
size_t
k
=
0
;
k
<
m
;
k
++
)
{
sum
+=
A
[
k
*
height
+
rowIdx
]
*
B
[
k
*
width
+
i
];
}
C
[
j
]
=
scaleAB
*
sum
+
scaleT
*
C
[
j
];
}
}
}
else
{
for
(
size_t
i
=
0
;
i
<
height
;
i
++
)
{
int
start
=
out
.
getRowStartIdx
(
i
);
int
end
=
out
.
getRowStartIdx
(
i
+
1
);
for
(
int
j
=
start
;
j
<
end
;
j
++
)
{
real
sum
=
0
;
size_t
colIdx
=
cols
[
j
];
for
(
size_t
k
=
0
;
k
<
m
;
k
++
)
{
sum
+=
A
[
k
*
height
+
i
]
*
B
[
k
*
width
+
colIdx
];
}
C
[
j
]
=
scaleAB
*
sum
+
scaleT
*
C
[
j
];
}
}
}
}
else
if
(
!
a
.
isTransposed
()
&&
b
.
isTransposed
())
{
size_t
m
=
a
.
getWidth
();
CHECK_EQ
(
b
.
getWidth
(),
m
);
CHECK_EQ
(
a
.
getHeight
(),
height
);
CHECK_EQ
(
b
.
getHeight
(),
width
);
if
(
out
.
getFormat
()
==
SPARSE_CSR
)
{
for
(
size_t
i
=
0
;
i
<
height
;
i
++
)
{
size_t
start
=
out
.
getRowStartIdx
(
i
);
size_t
end
=
out
.
getRowStartIdx
(
i
+
1
);
for
(
size_t
j
=
start
;
j
<
end
;
j
++
)
{
real
sum
=
0
;
size_t
colIdx
=
cols
[
j
];
for
(
size_t
k
=
0
;
k
<
m
;
k
++
)
{
sum
+=
A
[
i
*
m
+
k
]
*
B
[
colIdx
*
m
+
k
];
}
C
[
j
]
=
scaleAB
*
sum
+
scaleT
*
C
[
j
];
}
}
}
else
{
LOG
(
FATAL
)
<<
"Not supported csc format "
"when a is not trans and b is trans"
;
}
}
else
{
LOG
(
FATAL
)
<<
"Not supported"
;
}
}
/**
 * Dense CPU product: out = scaleT * out + scaleAB * (op(a) * op(b)),
 * forwarded to GEMM.
 *
 * op(x) is x itself, or its transpose when x.isTransposed() is set.
 *
 * \param out      dense result; must not be transposed
 * \param a        dense left operand
 * \param b        dense right operand
 * \param scaleAB  factor passed to GEMM for the product term
 * \param scaleT   factor passed to GEMM for the previous content of out
 */
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                            const CpuMatrix& a,
                            const CpuMatrix& b,
                            real scaleAB,
                            real scaleT) {
  /// todo(tianbing), clean the code
  CHECK(!out.isTransposed()) << "Not supported";

  // Logical (post-transpose) shapes of both operands.
  CBLAS_TRANSPOSE aTrans = CblasNoTrans;
  size_t aRow = a.getHeight();
  size_t aCol = a.getWidth();
  CBLAS_TRANSPOSE bTrans = CblasNoTrans;
  size_t bRow = b.getHeight();
  size_t bCol = b.getWidth();
  if (a.isTransposed()) {
    aTrans = CblasTrans;
    aRow = a.getWidth();
    aCol = a.getHeight();
  }
  if (b.isTransposed()) {
    bTrans = CblasTrans;
    bRow = b.getWidth();
    bCol = b.getHeight();
  }

  /// C = A * B, for matrix format
  CHECK_EQ(aCol, bRow);
  CHECK_EQ(aRow, out.getHeight());
  CHECK_EQ(bCol, out.getWidth());

  const real* A = a.getData();
  const real* B = b.getData();
  real* C = out.getData();

  int M = out.getHeight();
  int N = out.getWidth();
  int K = aCol;
  int lda = a.getStride();
  int ldb = b.getStride();
  int ldc = out.getStride();

  GEMM(aTrans, bTrans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);

  // Debug trace of the two leading elements of each buffer. It is emitted
  // only when every operand really has a second element, so that index 1 is
  // never read from a single-element (or empty) matrix.
  if (aRow * aCol > 1 && bRow * bCol > 1 && aRow * bCol > 1) {
    VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
            << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
  }
}
/// Per-thread scratch list of row pointers handed to simd::batchAddTo.
static ThreadLocal<std::vector<const real*>> threadLocalColArray;

/**
 * CPU product with a sparse left operand: out += op(a) * b, where `out` is
 * zeroed first when scaleT == 0.
 *
 * Restrictions enforced by CHECKs: out and b are not transposed, scaleT is
 * 0 or 1, scaleAB is 1 and a is stored as SPARSE_CSR.
 *
 * For every stored entry (i, cols[j]) of a:
 *   - a not transposed: row i of out receives row cols[j] of b
 *   - a transposed:     row cols[j] of out receives row i of b
 * The contribution is scaled by values[j] when a holds FLOAT_VALUE entries
 * and is added unscaled when a is NO_VALUE. Any other value type leaves
 * `out` untouched (apart from the optional zeroing).
 *
 * \param out      dense result, updated in place
 * \param a        sparse left operand (CSR)
 * \param b        dense right operand
 * \param scaleAB  must be 1
 * \param scaleT   must be 0 or 1
 */
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                            const CpuSparseMatrix& a,
                            const CpuMatrix& b,
                            real scaleAB,
                            real scaleT) {
  /// todo(tianbing), clean the code
  CHECK(!out.isTransposed()) << "Not supported";
  CHECK(!b.isTransposed()) << "Not supported";
  CHECK(scaleT == 0 || scaleT == 1) << "Not support";
  CHECK_EQ(scaleAB, static_cast<real>(1.0)) << "Not supported";
  CHECK_EQ(a.getFormat(), SPARSE_CSR) << "Not supported";

  const real* B = b.getData();
  real* C = out.getData();
  size_t height = out.getHeight();
  size_t width = out.getWidth();
  int* cols = a.getCols();
  real* values = a.getValue();

  if (scaleT == 0) {
    out.zeroMem();
  }

  // getRow() is called through a non-const reference, as before.
  CpuMatrix& bRows = const_cast<CpuMatrix&>(b);
  const bool aTrans = a.isTransposed();

  if (!aTrans) {
    size_t m = a.getWidth();
    CHECK_EQ(b.getHeight(), m);
    CHECK_EQ(a.getHeight(), height);
    CHECK_EQ(b.getWidth(), width);
  } else {
    size_t m = a.getHeight();
    CHECK_EQ(b.getHeight(), m);
    CHECK_EQ(a.getWidth(), height);
    CHECK_EQ(b.getWidth(), width);
  }

  if (a.getValueType() == NO_VALUE) {
    if (width % 32 == 0) {
      // use libaddto
      CHECK_EQ((size_t)B % 32, 0UL);
      CHECK_EQ((size_t)C % 32, 0UL);
      if (!aTrans) {
        auto& colArray = *threadLocalColArray;
        for (size_t i = 0; i < a.getHeight(); ++i) {
          const int start = a.getRowStartIdx(i);
          const int end = a.getRowStartIdx(i + 1);
          size_t colNum = end - start;
          colArray.resize(colNum);
          for (int j = 0; j < end - start; ++j) {
            colArray[j] = bRows.getRow(cols[j + start]);
          }
          // data() stays valid for an empty vector (a row of `a` with no
          // stored entries), unlike indexing element 0.
          simd::batchAddTo(out.getRow(i), colArray.data(), colNum, width);
        }
      } else {
        for (size_t i = 0; i < a.getHeight(); ++i) {
          const int start = a.getRowStartIdx(i);
          const int end = a.getRowStartIdx(i + 1);
          for (int j = start; j < end; ++j) {
            simd::addTo(out.getRow(cols[j]), bRows.getRow(i), width);
          }
        }
      }
    } else {
      for (size_t i = 0; i < a.getHeight(); ++i) {
        const int start = a.getRowStartIdx(i);
        const int end = a.getRowStartIdx(i + 1);
        for (int j = start; j < end; ++j) {
          if (!aTrans) {
            vecAddTo(out.getRow(i), bRows.getRow(cols[j]), width);
          } else {
            vecAddTo(out.getRow(cols[j]), bRows.getRow(i), width);
          }
        }
      }
    }
  } else if (a.getValueType() == FLOAT_VALUE) {
    for (size_t i = 0; i < a.getHeight(); ++i) {
      const int start = a.getRowStartIdx(i);
      const int end = a.getRowStartIdx(i + 1);
      for (int j = start; j < end; ++j) {
        if (!aTrans) {
          vecAddTo(out.getRow(i), bRows.getRow(cols[j]), values[j], width);
        } else {
          vecAddTo(out.getRow(cols[j]), bRows.getRow(i), values[j], width);
        }
      }
    }
  }
}
/**
 * CPU product with a sparse right operand: out += a * op(b), where `out` is
 * zeroed first when scaleT == 0.
 *
 * CHECKs require that out and a are not transposed, that scaleT is 0 or 1
 * and that scaleAB is 1. b may be stored as SPARSE_CSC (any other format is
 * walked row by row) and may be transposed.
 *
 * Every stored entry of b adds one column of a to one column of out via
 * colVecAddTo; the entry's value is used as the factor when b holds
 * FLOAT_VALUE entries, and no factor is applied for NO_VALUE. Any other
 * value type adds nothing.
 *
 * \param out      dense result, updated in place
 * \param a        dense left operand
 * \param b        sparse right operand
 * \param scaleAB  must be 1
 * \param scaleT   must be 0 or 1
 */
template <>
void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                            const CpuMatrix& a,
                            const CpuSparseMatrix& b,
                            real scaleAB,
                            real scaleT) {
  /// todo(tianbing), clean the code
  CHECK(!out.trans_) << "Not supported";
  CHECK(!a.isTransposed()) << "Not supported";
  CHECK(scaleT == 0 || scaleT == 1);
  CHECK_EQ(scaleAB, static_cast<real>(1.0));

  real* denseIn = const_cast<real*>(a.getData());
  real* entryValues = const_cast<real*>(b.getValue());
  real* denseOut = out.getData();
  int* rows = b.getRows();
  int* cols = b.getCols();

  if (scaleT == 0) {
    out.zeroMem();
  }

  /// todo(tianbing), clean the code
  const bool storedByColumn = (b.getFormat() == SPARSE_CSC);
  const bool bTransposed = b.isTransposed();

  // Shape agreement; which side of b pairs with which depends only on
  // whether b is transposed, not on its storage format.
  if (!bTransposed) {
    size_t m = a.getWidth();
    CHECK_EQ(b.getHeight(), m);
    CHECK_EQ(a.getHeight(), out.height_);
    CHECK_EQ(b.getWidth(), out.width_);
  } else {
    size_t m = a.getWidth();
    CHECK_EQ(b.getHeight(), out.width_);
    CHECK_EQ(a.getHeight(), out.height_);
    CHECK_EQ(b.getWidth(), m);
  }

  const bool weighted = (b.getValueType() == FLOAT_VALUE);
  if (!weighted && b.getValueType() != NO_VALUE) {
    return;  // neither of the two handled value types
  }

  // "Major" is the index the storage is grouped by (column for CSC, row
  // otherwise); "minor" is the index stored per entry.
  const size_t majorCount = storedByColumn ? b.getWidth() : b.getHeight();
  const int* minorIndex = storedByColumn ? rows : cols;

  // The major index picks the column of out exactly when the storage
  // orientation and the transpose flag differ; otherwise it picks the
  // column of a.
  const bool majorPicksOutColumn = (storedByColumn != bTransposed);

  for (size_t major = 0; major < majorCount; ++major) {
    int first = storedByColumn ? b.getColStartIdx(major)
                               : b.getRowStartIdx(major);
    int last = storedByColumn ? b.getColStartIdx(major + 1)
                              : b.getRowStartIdx(major + 1);
    for (int pos = first; pos < last; ++pos) {
      real* dst = majorPicksOutColumn ? denseOut + major
                                      : denseOut + minorIndex[pos];
      real* src = majorPicksOutColumn ? denseIn + minorIndex[pos]
                                      : denseIn + major;
      if (weighted) {
        colVecAddTo(dst,
                    src,
                    entryValues[pos],
                    out.height_,
                    out.width_,
                    a.getWidth());
      } else {
        colVecAddTo(dst, src, out.height_, out.width_, a.getWidth());
      }
    }
  }
}
/**
 * mul operator
 * out = scaleT * out + scaleAB*(in1 * in2)
 *
 * \param outputs[0] output matrix, M * N
 * \param inputs[0] first input (sparse) matrix, M * K (if non-trans)
 * \param inputs[1] second input matrix, K * N (if non-trans)
 */
template
<
DeviceType
Device
>
class
MulFunc
:
public
FunctionBase
{
...
...
@@ -33,19 +488,23 @@ public:
}
/**
 * Validates the argument lists and dispatches to MulOp<Device>.
 *
 * Expects exactly two inputs and one output, all with non-null data and
 * two-dimensional shapes; the output argument type must be ASSIGN_TO.
 * The scale factors come from the members scaleAB_ and scaleT_.
 *
 * The listing this replaces contained both sides of a diff: the argument
 * counts were checked against 1/2 and 2/1 at the same time, in1_mat and
 * in2_mat were declared twice, and the first input was required to be
 * sparse although a dense path follows. Only the two-input / one-output
 * variant that handles dense and sparse inputs is kept.
 *
 * \param inputs   inputs[0] and inputs[1] are the two factors
 * \param outputs  outputs[0] receives the product
 */
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
  /// todo(tianbing), add more checks
  CHECK_EQ((size_t)2, inputs.size());
  CHECK_EQ((size_t)1, outputs.size());
  CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
  CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
  CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
  CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
  CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);

  // NOTE(review): in1_mat / in2_mat get their type from matrix<Device>() and
  // are then assigned the result of SparseMatrix<Device>(); whether these
  // two types are assignment-compatible is not visible here - confirm.
  auto in1_mat = inputs[0].matrix<Device>();
  if (inputs[0].isSparseArg()) {
    in1_mat = inputs[0].sparse().SparseMatrix<Device>();
  }

  auto in2_mat = inputs[1].matrix<Device>();
  if (inputs[1].isSparseArg()) {
    in2_mat = inputs[1].sparse().SparseMatrix<Device>();
  }

  auto out_mat = outputs[0].matrix<Device>();
  MulOp<Device>(out_mat, in1_mat, in2_mat, scaleAB_, scaleT_);
}
...
...
@@ -54,6 +513,7 @@ private:
real
scaleT_
;
};
REGISTER_TYPED_FUNC
(
MulOp
,
CPU
,
MulFunc
);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC
(
MulOp
,
GPU
,
MulFunc
);
#endif
...
...
paddle/function/MulOp.h
浏览文件 @
1f0cbcf3
...
...
@@ -19,6 +19,40 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
namespace
paddle
{
/// Declarations of the MulOp overloads, one per combination of dense and
/// sparse operands. The operation computed by the specializations is
/// out = scaleT * out + scaleAB * (a * b).

/// Dense out, dense a, dense b (CPU matrix types).
template
<
DeviceType
DType
>
void
MulOp
(
CpuMatrix
&
out
,
const
CpuMatrix
&
a
,
const
CpuMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
/// Dense out, sparse a, dense b (CPU matrix types).
template
<
DeviceType
DType
>
void
MulOp
(
CpuMatrix
&
out
,
const
CpuSparseMatrix
&
a
,
const
CpuMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
/// Dense out, dense a, sparse b (CPU matrix types).
template
<
DeviceType
DType
>
void
MulOp
(
CpuMatrix
&
out
,
const
CpuMatrix
&
a
,
const
CpuSparseMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
/// Sparse out, dense a, dense b (CPU matrix types).
template
<
DeviceType
DType
>
void
MulOp
(
CpuSparseMatrix
&
out
,
const
CpuMatrix
&
a
,
const
CpuMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
/// Dense out, dense a, dense b (GPU matrix types).
template
<
DeviceType
DType
>
void
MulOp
(
GpuMatrix
&
out
,
const
GpuMatrix
&
a
,
const
GpuMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
template
<
DeviceType
DType
>
void
MulOp
(
GpuMatrix
&
out
,
...
...
@@ -27,4 +61,11 @@ void MulOp(GpuMatrix& out,
real
scaleAB
,
real
scaleT
);
/// Dense out, dense a, sparse b (GPU matrix types). Declaration only; the
/// operation is out = scaleT * out + scaleAB * (a * b).
template
<
DeviceType
DType
>
void
MulOp
(
GpuMatrix
&
out
,
const
GpuMatrix
&
a
,
const
GpuSparseMatrix
&
b
,
real
scaleAB
,
real
scaleT
);
}
// namespace paddle
paddle/function/MulOpGpu.cu
浏览文件 @
1f0cbcf3
...
...
@@ -20,6 +20,65 @@ limitations under the License. */
namespace
paddle
{
/**
 * Dense GPU product, forwarded to hl_matrix_mul:
 *   out = scale_t * out + scale_ab * (a * b)
 * out : output matrix, M * N
 *
 * Either operand may be transposed, but not both at once; `out` must not be
 * transposed.
 *
 * \param out       result matrix, updated in place
 * \param a         left operand
 * \param b         right operand
 * \param scale_ab  factor for the product term
 * \param scale_t   factor for the previous content of out
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(!out.isTransposed()) << "Not supported";

  const bool lhs_transposed = a.isTransposed();
  const bool rhs_transposed = b.isTransposed();

  // Shape agreement for each supported layout.
  if (lhs_transposed && rhs_transposed) {
    LOG(FATAL) << "Is not supported";
  } else if (lhs_transposed) {
    /// a : K * M, b : K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.width_);
    CHECK_EQ(a.height_, b.height_);
  } else if (rhs_transposed) {
    /// a: M * K, b : N * K
    CHECK_EQ(out.width_, b.height_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.width_);
  } else {
    /// a : M * K, b: K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.height_);
  }

  real* lhs = a.data_;
  real* rhs = b.data_;
  real* result = out.data_;

  // Problem size: result is rows x cols, `inner` is the shared dimension.
  int rows = out.getHeight();
  int cols = out.getWidth();
  int inner = lhs_transposed ? a.height_ : a.width_;

  int lhs_stride = a.getStride();
  int rhs_stride = b.getStride();
  int result_stride = out.getStride();

  hl_trans_op_t lhs_op = lhs_transposed ? HPPL_OP_T : HPPL_OP_N;
  hl_trans_op_t rhs_op = rhs_transposed ? HPPL_OP_T : HPPL_OP_N;

  hl_matrix_mul(lhs,
                lhs_op,
                rhs,
                rhs_op,
                result,
                rows,
                cols,
                inner,
                scale_ab,
                scale_t,
                lhs_stride,
                rhs_stride,
                result_stride);
}
/**
* out = scale_t * out + scale_ab * (a * b)
* out : M * N
*/
template
<
>
void
MulOp
<
DEVICE_TYPE_GPU
>
(
GpuMatrix
&
out
,
...
...
@@ -32,12 +91,15 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
CHECK
(
b
.
useGpu_
==
true
)
<<
"Matrix type are not equal"
;
CHECK
(
!
out
.
trans_
&&
!
b
.
trans_
)
<<
"not supported"
;
if
(
!
a
.
trans_
)
{
/// a: M * K, b: K * N
CHECK
(
out
.
width_
==
b
.
width_
&&
out
.
height_
==
a
.
height_
&&
a
.
width_
==
b
.
height_
)
<<
"Matrix dimensions are not equal"
;
}
else
{
/// a: K * M, transpose, b: K * N
CHECK
(
out
.
width_
==
b
.
width_
&&
out
.
height_
==
a
.
width_
&&
a
.
height_
==
b
.
height_
)
<<
"Matrix dimensions are not equal"
;
}
hl_trans_op_t
a_trans
=
a
.
trans_
?
HPPL_OP_T
:
HPPL_OP_N
;
hl_sparse_matrix_s
a_data
=
a
.
sMatrix_
.
get
();
real
*
b_data
=
b
.
data_
;
...
...
@@ -54,4 +116,58 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
scale_t
);
}
/**
 * GPU product with a sparse right operand:
 *   out = scale_t * out + scale_ab * (a * b)
 * out : M * N
 *
 * `out` and `a` must be contiguous. The call goes to
 * hl_matrix_dense_mul_csc when b is stored as SPARSE_CSC and to
 * hl_matrix_dense_mul_csr otherwise; `a` is always passed as HPPL_OP_N.
 *
 * \param out       dense result, updated in place
 * \param a         dense left operand
 * \param b         sparse right operand, optionally transposed
 * \param scale_ab  factor for the product term
 * \param scale_t   factor for the previous content of out
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuSparseMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(out.isContiguous());
  CHECK(a.isContiguous());
  CHECK(a.useGpu_ == true) << "Matrix type are not equal";

  hl_sparse_matrix_s sparse_rhs = b.sMatrix_.get();
  real* dense_lhs = a.data_;
  real* result = out.data_;

  // Pick the transpose flag for b and verify the matching shape relation.
  hl_trans_op_t rhs_op = HPPL_OP_N;
  if (b.trans_) {
    rhs_op = HPPL_OP_T;
    /// a : M * K, b : N * K, transpose
    CHECK(out.width_ == b.height_ && out.height_ == a.height_ &&
          a.width_ == b.width_)
        << "Matrix dimensions are not equal";
  } else {
    /// a : M * K, b : K * N
    CHECK(out.width_ == b.width_ && out.height_ == a.height_ &&
          a.width_ == b.height_)
        << "Matrix dimensions are not equal";
  }

  if (b.format_ != SPARSE_CSC) {
    hl_matrix_dense_mul_csr(dense_lhs,
                            HPPL_OP_N,
                            sparse_rhs,
                            rhs_op,
                            result,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  } else {
    hl_matrix_dense_mul_csc(dense_lhs,
                            HPPL_OP_N,
                            sparse_rhs,
                            rhs_op,
                            result,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  }
}
}
// namespace paddle
paddle/function/MulOpTest.cpp
浏览文件 @
1f0cbcf3
...
...
@@ -22,31 +22,41 @@ using namespace paddle; // NOLINT
void
testSpMatrixMul
(
int
M
,
int
N
,
int
K
,
real
rate
,
real
scale1
,
real
scale2
)
{
/// todo(tianbing) check CPU/GPU
const
auto
gpuFunc
=
FunctionBase
::
funcRegistrar_
.
createByType
(
"MulO
P
-GPU"
);
const
auto
gpuFunc
=
FunctionBase
::
funcRegistrar_
.
createByType
(
"MulO
p
-GPU"
);
gpuFunc
->
init
(
FuncConfig
().
set
(
"scaleAB"
,
scale1
).
set
(
"scaleT"
,
scale2
));
int
nnz
=
M
*
K
*
rate
;
auto
gpuA
=
std
::
make_shared
<
GpuSparseMatrix
>
(
M
,
K
,
nnz
);
const
auto
gpuB
=
std
::
make_shared
<
GpuMatrix
>
(
K
,
N
);
const
auto
gpuOut
=
std
::
make_shared
<
GpuMatrix
>
(
M
,
N
);
int
nnz
=
M
*
N
*
rate
;
MatrixPtr
cpuA
=
std
::
make_shared
<
CpuMatrix
>
(
M
,
K
);
MatrixPtr
cpuB
=
std
::
make_shared
<
CpuMatrix
>
(
N
,
K
);
MatrixPtr
cpuC
(
new
CpuSparseMatrix
(
M
,
N
,
nnz
)
);
gpuA
->
randomizeUniform
();
gpuB
->
randomizeUniform
();
gpuOut
->
randomizeUniform
();
MatrixPtr
gpuA
=
std
::
make_shared
<
GpuMatrix
>
(
M
,
K
);
MatrixPtr
gpuB
=
std
::
make_shared
<
GpuMatrix
>
(
N
,
K
);
MatrixPtr
gpuC
(
new
GpuSparseMatrix
(
M
,
N
,
nnz
));
cpuA
->
randomizeUniform
();
cpuB
->
randomizeUniform
();
cpuC
->
randomizeUniform
();
hl_stream_t
stream
(
HPPL_STREAM_3
);
gpuA
->
copyFrom
(
*
cpuA
,
stream
);
gpuB
->
copyFrom
(
*
cpuB
,
stream
);
gpuC
->
copyFrom
(
*
cpuC
,
stream
);
hl_stream_synchronize
(
stream
);
BufferArgs
inputs
;
BufferArgs
outputs
;
inputs
.
addArg
(
*
gpuA
);
inputs
.
addArg
(
*
gpuB
);
outputs
.
addArg
(
*
gpu
Out
);
inputs
.
addArg
(
*
gpuA
->
getTranspose
()
);
inputs
.
addArg
(
*
gpuB
->
getTranspose
()
);
outputs
.
addArg
(
*
gpu
C
,
ASSIGN_TO
);
gpuFunc
->
calc
(
inputs
,
outputs
);
}
TEST
(
SMatrix
,
sMatrixMul
)
{
for
(
auto
M
:
{
1
,
40
,
128
,
200
})
{
for
(
auto
N
:
{
100
,
2000
,
20480
})
{
for
(
auto
K
:
{
100
,
512
,
1024
})
{
for
(
auto
N
:
{
100
})
{
for
(
auto
K
:
{
100
})
{
/// todo(tianbing), add scaleAB and scaleT
VLOG
(
3
)
<<
" M="
<<
M
<<
" N="
<<
N
<<
" K="
<<
K
;
testSpMatrixMul
(
M
,
N
,
K
,
0.05
,
1
,
1
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录