Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2803cf57
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2803cf57
编写于
12月 18, 2018
作者:
Y
Yu Yang
提交者:
GitHub
12月 18, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #14868 from reyoung/feature/refine_w2v
Feature/refine w2v
上级
6aa6b8cf
4de1a8bd
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
409 addition
and
222 deletion
+409
-222
cmake/external/python.cmake
cmake/external/python.cmake
+2
-3
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+18
-10
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+8
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+21
-0
paddle/fluid/operators/math/matrix_bit_code.cc
paddle/fluid/operators/math/matrix_bit_code.cc
+321
-159
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+37
-50
paddle/fluid/platform/dynload/mklml.h
paddle/fluid/platform/dynload/mklml.h
+2
-0
未找到文件。
cmake/external/python.cmake
浏览文件 @
2803cf57
...
...
@@ -18,8 +18,8 @@ ENDIF()
INCLUDE
(
python_module
)
FIND_PACKAGE
(
PythonInterp
${
PY_VERSION
}
)
FIND_PACKAGE
(
PythonLibs
${
PY_VERSION
}
)
FIND_PACKAGE
(
PythonInterp
${
PY_VERSION
}
REQUIRED
)
FIND_PACKAGE
(
PythonLibs
${
PY_VERSION
}
REQUIRED
)
if
(
WIN32
)
execute_process
(
COMMAND
"
${
PYTHON_EXECUTABLE
}
"
"-c"
...
...
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
"please use pip to upgrade protobuf. pip install -U protobuf"
)
ENDIF
()
ENDIF
(
PYTHONINTERP_FOUND
)
INCLUDE_DIRECTORIES
(
${
PYTHON_INCLUDE_DIR
}
)
INCLUDE_DIRECTORIES
(
${
PYTHON_NUMPY_INCLUDE_DIR
}
)
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
2803cf57
...
...
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
label
.
data
<
int64_t
>
()));
}
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
out_grad
);
// softrelu derivative
Eigen
::
array
<
int
,
2
>
bcast
{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}
;
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
)
;
// softrelu derivative
pre_out_grad_mat
.
device
(
place
)
=
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
();
auto
*
pre_out_grad_data
=
pre_out_grad
.
data
<
T
>
();
auto
*
pre_out_data
=
pre_out
.
data
<
T
>
();
auto
n
=
pre_out
.
numel
();
blas
.
VEXP
(
n
,
pre_out_data
,
pre_out_grad_data
);
blas
.
VINV
(
n
,
pre_out_grad_data
,
pre_out_grad_data
);
for
(
int64_t
i
=
0
;
i
<
n
;
++
i
)
{
pre_out_grad_data
[
i
]
=
1.0
-
pre_out_grad_data
[
i
];
}
bit_code
->
Sub
(
&
pre_out_grad
);
// the gradient of clip(w * x + b)
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
*
out_grad_mat
.
broadcast
(
bcast
);
auto
*
out_grad_data
=
out_grad
.
data
<
T
>
();
int64_t
dim0
=
pre_out_grad
.
dims
()[
0
];
int64_t
dim1
=
pre_out_grad
.
dims
()[
1
];
for
(
int64_t
i
=
0
;
i
<
dim0
;
++
i
)
{
T
tmp
=
out_grad_data
[
i
];
blas
.
SCAL
(
dim1
,
tmp
,
pre_out_grad_data
+
i
*
dim1
);
}
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
...
...
paddle/fluid/operators/math/blas.h
浏览文件 @
2803cf57
...
...
@@ -181,6 +181,9 @@ class Blas {
const
framework
::
Tensor
&
mat_b
,
const
MatDescriptor
&
dim_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
;
template
<
typename
T
>
void
VINV
(
int
n
,
const
T
*
a
,
T
*
y
)
const
;
private:
const
DeviceContext
&
context_
;
};
...
...
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
BatchedGEMM
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
VINV
(
ARGS
...
args
)
const
{
Base
()
->
template
VINV
<
T
>(
args
...);
}
private:
const
Blas
<
DeviceContext
>*
Base
()
const
{
return
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
2803cf57
...
...
@@ -118,6 +118,11 @@ struct CBlas<float> {
static
void
VPOW
(
ARGS
...
args
)
{
platform
::
dynload
::
vsPowx
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
VINV
(
ARGS
...
args
)
{
platform
::
dynload
::
vsInv
(
args
...);
}
};
template
<
>
...
...
@@ -213,6 +218,11 @@ struct CBlas<double> {
static
void
VPOW
(
ARGS
...
args
)
{
platform
::
dynload
::
vdPowx
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
VINV
(
ARGS
...
args
)
{
platform
::
dynload
::
vdInv
(
args
...);
}
};
#else
...
...
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
dim_a
.
stride_
,
dim_b
.
stride_
);
}
}
template
<
typename
DeviceContext
>
template
<
typename
T
>
void
Blas
<
DeviceContext
>::
VINV
(
int
n
,
const
T
*
a
,
T
*
y
)
const
{
#ifdef PADDLE_WITH_MKLML
CBlas
<
T
>::
VINV
(
n
,
a
,
y
);
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.0
/
a
[
i
];
}
#endif
}
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/math/matrix_bit_code.cc
浏览文件 @
2803cf57
...
...
@@ -14,195 +14,334 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include <iostream>
#include <map>
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
{
size_t
batch_size
=
tmat
->
dims
()[
0
];
size_t
width
=
tmat
->
dims
()[
1
];
struct
MatrixBitCodeFunctorAdd
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
vec_
;
framework
::
Tensor
*
tmat_
;
MatrixBitCodeFunctorAdd
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
:
vec_
(
vec
),
tmat_
(
tmat
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
batch_size
=
tmat_
->
dims
()[
0
];
size_t
width
=
tmat_
->
dims
()[
1
];
auto
*
tmat_data
=
tmat_
->
data
<
T
>
();
auto
*
vec_data
=
vec_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
tmat
->
data
<
T
>
()[
i
*
width
+
j
]
+=
vec
.
data
<
T
>
()
[
index
];
size_t
index
=
code
.
calc_index
(
j
);
tmat_data
[
i
*
width
+
j
]
+=
vec_data
[
index
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
{
MatrixBitCodeFunctorAdd
<
T
>
func
(
vec
,
tmat
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
{
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorAddGrad
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
vec_
;
MatrixBitCodeFunctorAddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
:
tmat_
(
tmat
),
vec_
(
vec
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
table
)
{
size_t
batch_size
=
tmat_
.
dims
()[
0
];
size_t
width
=
tmat_
.
dims
()[
1
];
auto
*
vec_data
=
vec_
->
data
<
T
>
();
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
vec
->
data
<
T
>
()[
index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
size_t
index
=
code
.
calc_index
(
j
);
vec_data
[
index
]
+=
tmat_data
[
i
*
width
+
j
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
{
MatrixBitCodeFunctorAddGrad
<
T
>
func
(
tmat
,
vec
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorSelectedRowsAddGrad
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
SelectedRows
*
vec_
;
MatrixBitCodeFunctorSelectedRowsAddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
:
tmat_
(
tmat
),
vec_
(
vec
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
batch_size
=
tmat_
.
dims
()[
0
];
size_t
width
=
tmat_
.
dims
()[
1
];
auto
*
vec_data
=
vec_
->
mutable_value
()
->
template
data
<
T
>();
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
int64_t
row_index
=
vec
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec
->
mutable_value
()
->
data
<
T
>
()[
row_index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
size_t
index
=
code
.
calc_index
(
j
);
int64_t
row_index
=
vec_
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec_data
[
row_index
]
+=
tmat_data
[
i
*
width
+
j
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
MatrixBitCodeFunctorSelectedRowsAddGrad
<
T
>
func
(
tmat
,
vec
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
{
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
o_width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorSum
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
sum_
;
T
scale_sum_
;
MatrixBitCodeFunctorSum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
:
tmat_
(
tmat
),
sum_
(
sum
),
scale_sum_
(
scale_sum
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
o_width
=
tmat_
.
dims
()[
1
];
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
auto
*
sum_data
=
sum_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
T
sm
=
static_cast
<
T
>
(
0.0
);
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
->
calc_bit
(
j
))
{
// calc_bit starts from right most bit, while data in tmat[i] is in the
if
(
code
.
calc_bit
(
j
))
{
// calc_bit starts from right most bit, while data in tmat[i] is in
// the
// reverse order.
sm
+=
tmat
.
data
<
T
>
()
[
i
*
o_width
+
j
];
sm
+=
tmat_data
[
i
*
o_width
+
j
];
}
}
sum
->
data
<
T
>
()[
i
]
=
scale_sum
*
sm
;
sum_data
[
i
]
=
scale_sum_
*
sm
;
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
{
MatrixBitCodeFunctorSum
<
T
>
func
(
tmat
,
sum
,
scale_sum
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Mul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMul
:
public
boost
::
static_visitor
<
void
>
{
framework
::
Tensor
*
tmat_
;
const
framework
::
Tensor
&
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
tmat_width
=
tmat
->
dims
()[
1
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
weight_width
=
weight
.
dims
()[
1
];
auto
tmat_value
=
tmat
->
data
<
T
>
();
auto
weight_value
=
weight
.
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
size_t
num_samples
=
tmat_
->
dims
()[
0
];
size_t
tmat_width
=
tmat_
->
dims
()[
1
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
weight_width
=
weight_
.
dims
()[
1
];
auto
tmat_value
=
tmat_
->
data
<
T
>
();
auto
weight_value
=
weight_
.
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_row
=
input_value
+
input_width
*
i
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_row
=
input_value
+
input_width
*
i
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
const
T
*
weight_row
=
weight_value
+
weight_width
*
index
;
T
sum
=
static_cast
<
T
>
(
0.0
);
sum
=
blas
.
DOT
(
input_width
,
weight_row
,
input_row
);
size_t
index
=
code
.
calc_index
(
j
);
const
T
*
weight_row
=
weight_value
+
weight_width
*
index
;
T
sum
=
blas
.
DOT
(
input_width
,
weight_row
,
input_row
);
tmat_value
[
i
*
tmat_width
+
j
]
+=
sum
;
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Mul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMul
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
,
size_t
N
>
class
ReservedVector
:
public
std
::
vector
<
T
>
{
public:
ReservedVector
()
{
this
->
reserve
(
N
);
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMulGradWeight
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
weight_width
=
weight
->
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
weight_width
=
weight_
->
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
->
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
std
::
map
<
int
,
ReservedVector
<
std
::
pair
<
T
,
const
T
*>
,
8u
>>
ops
;
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
ops
[
code
->
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
ops
[
code
.
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
}
}
for
(
auto
&
op
:
ops
)
{
auto
&
op_in_row
=
op
.
second
;
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
T
*
weight_row
=
weight_value
+
op
.
first
*
weight_width
;
for
(
auto
&
op
:
ops
)
{
auto
&
op_in_row
=
op
.
second
;
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
T
*
weight_row
=
weight_value
+
op
.
first
*
weight_width
;
blas
.
AXPY
(
input_width
,
scale
,
input_row
,
weight_row
);
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMulGradWeight
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMulGradWeightSR
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
SelectedRows
*
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMulGradWeightSR
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
weight_width
=
weight
->
value
().
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
->
mutable_value
()
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
weight_width
=
weight_
->
value
().
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
->
mutable_value
()
->
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
ops
.
reserve
(
weight
->
rows
().
size
());
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
ops
.
reserve
(
weight_
->
rows
().
size
());
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
ops
[
code
->
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
ops
[
code
.
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
}
}
for
(
auto
&
row
:
weight
->
rows
())
{
auto
&
op_in_row
=
ops
[
row
];
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
for
(
auto
&
row
:
weight_
->
rows
())
{
auto
&
op_in_row
=
ops
[
row
];
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
blas
.
AXPY
(
input_width
,
scale
,
input_row
,
weight_value
);
}
weight_value
+=
weight_width
;
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMulGradWeightSR
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
{
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
input_width
=
input
->
dims
()[
1
];
size_t
weight_width
=
weight
.
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
.
data
<
T
>
();
auto
input_value
=
input
->
data
<
T
>
();
struct
MatrixBitCodeFunctorMulGradError
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
const
framework
::
Tensor
&
weight_
;
framework
::
Tensor
*
input_
;
MatrixBitCodeFunctorMulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
input_width
=
input_
->
dims
()[
1
];
size_t
weight_width
=
weight_
.
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
.
data
<
T
>
();
auto
input_value
=
input_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
size_t
index
=
code
.
calc_index
(
j
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
input_value
[
input_width
*
i
+
k
]
+=
...
...
@@ -211,21 +350,44 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
}
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
{
MatrixBitCodeFunctorMulGradError
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sub
(
framework
::
Tensor
*
tmat
)
{
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
o_width
=
tmat
->
dims
()[
1
];
struct
MatrixBitCodeFunctorSub
:
public
boost
::
static_visitor
<
void
>
{
framework
::
Tensor
*
tmat_
;
explicit
MatrixBitCodeFunctorSub
(
framework
::
Tensor
*
tmat
)
:
tmat_
(
tmat
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
->
dims
()[
0
];
size_t
o_width
=
tmat_
->
dims
()[
1
];
auto
*
tmat_data
=
tmat_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
->
calc_bit
(
j
))
{
tmat
->
data
<
T
>
()[
i
*
o_width
+
j
]
-=
1
;
if
(
code
.
calc_bit
(
j
))
{
tmat_data
[
i
*
o_width
+
j
]
-=
1
;
}
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sub
(
framework
::
Tensor
*
tmat
)
{
MatrixBitCodeFunctorSub
<
T
>
func
(
tmat
);
code_table_
.
apply_visitor
(
func
);
}
template
class
MatrixBitCodeFunctor
<
float
>;
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
2803cf57
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <unordered_map>
#include <utility>
#include <vector>
...
...
@@ -22,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/variant.h"
#if defined(_WIN32)
#include <intrin.h>
...
...
@@ -98,24 +100,7 @@ inline int clz(const T& value) {
inline
size_t
FindLastSet
(
size_t
x
)
{
return
sizeof
(
size_t
)
*
8
-
clz
(
x
);
}
#endif // !_WIN32
// set a code interface to create multiple code
class
Code
{
public:
virtual
~
Code
()
{}
virtual
size_t
calc_index
(
int
bit
)
const
=
0
;
virtual
bool
calc_bit
(
int
bit
)
const
=
0
;
virtual
int
get_length
()
const
=
0
;
};
// set a CodeTable interface to create multiple code table
class
CodeTable
{
public:
virtual
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
=
0
;
virtual
size_t
size
()
const
=
0
;
virtual
int
get_max_code_length
()
const
=
0
;
virtual
~
CodeTable
()
{}
};
class
SimpleCode
:
public
Code
{
class
SimpleCode
{
public:
SimpleCode
(
size_t
code
,
size_t
num_classes
,
const
int64_t
*
ids
)
:
c_
(
static_cast
<
size_t
>
(
ids
[
code
])
+
num_classes
)
{}
...
...
@@ -137,16 +122,16 @@ class SimpleCode : public Code {
};
template
<
typename
T
>
class
CustomCode
:
public
Code
{
class
CustomCode
{
public:
CustomCode
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
,
int
index
)
:
ids_
(
ids
),
index_
(
index
)
{
ptable_
=
ptable
.
Slice
(
index
,
index
+
1
)
;
pcode_
=
pcode
.
Slice
(
index
,
index
+
1
)
;
const
int64_t
*
ids
,
int
index
)
{
seq_len_
=
ptable
.
dims
()[
1
];
ptable_
data_
=
ptable
.
data
<
T
>
()
+
seq_len_
*
index
;
pcode_
data_
=
pcode
.
data
<
T
>
()
+
seq_len_
*
index
;
}
/**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* Here the id of root shou
l
d be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using
* prefixes.
* Weight index is the prefixes of encoding, thus leave out the right most
...
...
@@ -154,36 +139,37 @@ class CustomCode : public Code {
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t
calc_index
(
int
bit
)
const
{
return
ptable_
.
data
<
T
>
()[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_
.
data
<
T
>
()[
bit
];
}
int
get_length
()
const
{
int
length
=
0
;
size_t
calc_index
(
int
bit
)
const
{
return
ptable_data_
[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_data_
[
bit
];
}
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
ptable_
.
dims
()[
1
]);
i
++
)
{
if
(
ptable_
.
data
<
T
>
()[
i
]
>=
0
)
{
length
++
;
}
else
{
return
length
;
}
// NOTE: this function is not thread-safe.
int
get_length
()
const
{
if
(
length_
<
0
)
{
auto
len
=
seq_len_
;
length_
=
static_cast
<
int
>
(
std
::
find_if
(
ptable_data_
,
ptable_data_
+
len
,
[](
const
T
&
val
)
{
return
val
<
0
;
})
-
ptable_data_
);
}
return
length
;
return
length
_
;
}
private:
framework
::
Tensor
ptable
_
;
framework
::
Tensor
pcode
_
;
const
int64_t
*
ids
_
;
const
int
index_
;
int64_t
seq_len
_
;
const
T
*
ptable_data
_
;
const
T
*
pcode_data
_
;
mutable
int
length_
{
-
1
}
;
};
class
SimpleCodeTable
:
public
CodeTable
{
class
SimpleCodeTable
{
public:
SimpleCodeTable
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
SimpleCode
(
code
,
num_classes_
,
ids_
));
return
coder
;
SimpleCode
get_code
(
int64_t
code
)
const
{
return
SimpleCode
(
code
,
num_classes_
,
ids_
)
;
}
size_t
size
()
const
{
return
num_classes_
;
}
int
get_max_code_length
()
const
{
return
FindLastSet
(
num_classes_
-
1
);
}
...
...
@@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable {
};
template
<
typename
T
>
class
CustomCodeTable
:
public
CodeTable
{
class
CustomCodeTable
{
public:
CustomCodeTable
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
ptable_
(
ptable
),
pcode_
(
pcode
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
));
return
coder
;
CustomCode
<
T
>
get_code
(
int64_t
code
)
const
{
return
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
);
}
size_t
size
()
const
{
return
static_cast
<
size_t
>
(
ptable_
.
dims
()[
1
]);
}
...
...
@@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable {
const
int64_t
*
ids_
;
};
using
CodeTable
=
boost
::
variant
<
SimpleCodeTable
,
CustomCodeTable
<
int64_t
>>
;
template
<
typename
T
>
class
MatrixBitCodeFunctor
{
public:
MatrixBitCodeFunctor
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
),
code_table_
(
new
SimpleCodeTable
(
num_classes
,
ids
))
{}
code_table_
(
SimpleCodeTable
(
num_classes
,
ids
))
{}
MatrixBitCodeFunctor
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
num_classes_
(
static_cast
<
size_t
>
(
ptable
.
dims
()[
1
])),
ids_
(
ids
),
code_table_
(
new
CustomCodeTable
<
int64_t
>
(
ptable
,
pcode
,
ids
))
{}
code_table_
(
CustomCodeTable
<
int64_t
>
(
ptable
,
pcode
,
ids
))
{}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
...
...
@@ -277,7 +264,7 @@ class MatrixBitCodeFunctor {
size_t
num_classes_
;
const
int64_t
*
ids_
;
std
::
unique_ptr
<
CodeTable
>
code_table_
;
CodeTable
code_table_
;
};
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/platform/dynload/mklml.h
浏览文件 @
2803cf57
...
...
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_MKLML_WRAP
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录