Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
6720e67f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
6720e67f
编写于
1月 19, 2018
作者:
C
Cao Ying
提交者:
GitHub
1月 19, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #7675 from lcy-seso/remove_copy_from_crf
delete memory copy from linear_chain_crf_op.
上级
a1c281f0
4020451a
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
20 additions
and
210 deletions
+20
-210
paddle/operators/linear_chain_crf_op.cc
paddle/operators/linear_chain_crf_op.cc
+2
-2
paddle/operators/linear_chain_crf_op.h
paddle/operators/linear_chain_crf_op.h
+18
-208
未找到文件。
paddle/operators/linear_chain_crf_op.cc
浏览文件 @
6720e67f
...
...
@@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
LoDTensor
>
(
"Emission"
)
->
type
()),
ctx
.
device_context
());
platform
::
CPUPlace
());
}
};
...
...
@@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
framework
::
ToDataType
(
ctx
.
Input
<
LoDTensor
>
(
framework
::
GradVarName
(
"LogLikelihood"
))
->
type
()),
ctx
.
device_context
());
platform
::
CPUPlace
());
}
};
...
...
paddle/operators/linear_chain_crf_op.h
浏览文件 @
6720e67f
...
...
@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
const
size_t
level
=
0
;
const
size_t
seq_num
=
in_lod
[
level
].
size
()
-
1
;
// These local variables hold the inputs and outputs, guaranteeing that they
// are in CPU memory, to provide a consistent reference.
// TODO(caoying) Fix this by moving all these local variables into the
// class's data members once we can profile the whole training process.
LoDTensor
*
emission_weights
=
nullptr
;
LoDTensor
emission_weight_tensor
;
Tensor
*
transition_weights
=
nullptr
;
Tensor
transition_weight_tensor
;
LoDTensor
*
label
=
nullptr
;
LoDTensor
label_tensor
;
Tensor
*
emission_exps
=
nullptr
;
Tensor
emission_exps_tensor
;
Tensor
*
transition_exps
=
nullptr
;
Tensor
transition_exps_tensor
;
Tensor
*
alpha
=
nullptr
;
Tensor
alpha_tensor
;
Tensor
*
ll
=
nullptr
;
Tensor
ll_tensor
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
emission_weights
=
&
emission_weight_tensor
;
transition_weights
=
&
transition_weight_tensor
;
label
=
&
label_tensor
;
CopyInputsToCpuMemory
(
ctx
.
device_context
(),
*
ctx
.
Input
<
LoDTensor
>
(
"Emission"
),
*
ctx
.
Input
<
Tensor
>
(
"Transition"
),
*
ctx
.
Input
<
LoDTensor
>
(
"Label"
),
emission_weights
,
transition_weights
,
label
);
emission_exps
=
&
emission_exps_tensor
;
emission_exps
->
Resize
(
emission_weights
->
dims
());
transition_exps
=
&
transition_exps_tensor
;
transition_exps
->
Resize
(
transition_weights
->
dims
());
alpha
=
&
alpha_tensor
;
alpha
->
Resize
(
ctx
.
Output
<
Tensor
>
(
"Alpha"
)
->
dims
());
ll
=
&
ll_tensor
;
}
else
{
emission_weights
=
const_cast
<
LoDTensor
*>
(
ctx
.
Input
<
LoDTensor
>
(
"Emission"
));
transition_weights
=
const_cast
<
Tensor
*>
(
ctx
.
Input
<
Tensor
>
(
"Transition"
));
label
=
const_cast
<
LoDTensor
*>
(
ctx
.
Input
<
LoDTensor
>
(
"Label"
));
emission_exps
=
ctx
.
Output
<
Tensor
>
(
"EmissionExps"
);
transition_exps
=
ctx
.
Output
<
Tensor
>
(
"TransitionExps"
);
alpha
=
ctx
.
Output
<
Tensor
>
(
"Alpha"
);
ll
=
ctx
.
Output
<
Tensor
>
(
"LogLikelihood"
);
}
const
LoDTensor
*
emission_weights
=
ctx
.
Input
<
LoDTensor
>
(
"Emission"
);
const
Tensor
*
transition_weights
=
ctx
.
Input
<
Tensor
>
(
"Transition"
);
const
LoDTensor
*
label
=
ctx
.
Input
<
LoDTensor
>
(
"Label"
);
Tensor
*
emission_exps
=
ctx
.
Output
<
Tensor
>
(
"EmissionExps"
);
Tensor
*
transition_exps
=
ctx
.
Output
<
Tensor
>
(
"TransitionExps"
);
Tensor
*
alpha
=
ctx
.
Output
<
Tensor
>
(
"Alpha"
);
Tensor
*
ll
=
ctx
.
Output
<
Tensor
>
(
"LogLikelihood"
);
// Because the computation code only runs on CPU, the memory for all
// the outputs is FIXED to be allocated in CPU memory.
...
...
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
one_seq
,
one_seq_row_max
,
one_seq_exps
,
*
transition_weights
,
*
transition_exps
,
one_seq_label
,
&
one_seq_alpha
);
}
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
CopyOutputsToGpuMemory
(
ctx
.
device_context
(),
*
emission_exps
,
*
transition_exps
,
*
alpha
,
*
ll
,
ctx
.
Output
<
Tensor
>
(
"EmissionExps"
),
ctx
.
Output
<
Tensor
>
(
"TransitionExps"
),
ctx
.
Output
<
Tensor
>
(
"Alpha"
),
ctx
.
Output
<
Tensor
>
(
"LogLikelihood"
));
}
};
private:
void
CopyInputsToCpuMemory
(
const
platform
::
DeviceContext
&
ctx
,
const
LoDTensor
&
emission_weights_src
,
const
Tensor
&
transition_weights_src
,
const
LoDTensor
&
label_src
,
LoDTensor
*
emission_weights_dst
,
Tensor
*
transition_weights_dst
,
LoDTensor
*
label_dst
)
const
{
// Copy the inputs from GPU memory to CPU memory if this operator runs on
// a GPU device.
auto
copyLoDTensor
=
[](
const
platform
::
DeviceContext
&
ctx
,
const
LoDTensor
&
src
,
LoDTensor
*
dst
)
{
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
platform
::
CPUPlace
());
framework
::
Copy
(
src
,
platform
::
CPUPlace
(),
ctx
,
dst
);
};
copyLoDTensor
(
ctx
,
emission_weights_src
,
emission_weights_dst
);
copyLoDTensor
(
ctx
,
label_src
,
label_dst
);
transition_weights_dst
->
mutable_data
<
T
>
(
transition_weights_src
.
dims
(),
platform
::
CPUPlace
());
framework
::
Copy
(
transition_weights_src
,
platform
::
CPUPlace
(),
ctx
,
transition_weights_dst
);
}
void
CopyOutputsToGpuMemory
(
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
emission_exps_src
,
const
Tensor
&
transition_exps_src
,
const
Tensor
&
alpha_src
,
const
Tensor
&
ll_src
,
Tensor
*
emission_exps_dst
,
Tensor
*
transition_exps_dst
,
Tensor
*
alpha_dst
,
Tensor
*
ll_dst
)
const
{
// Copy the forward results from CPU memory to GPU memory if this
// operator runs on a GPU device.
auto
copyTensor
=
[](
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
src
,
Tensor
*
dst
)
{
dst
->
mutable_data
<
T
>
(
platform
::
CUDAPlace
());
framework
::
Copy
(
src
,
platform
::
CUDAPlace
(),
ctx
,
dst
);
};
copyTensor
(
ctx
,
emission_exps_src
,
emission_exps_dst
);
copyTensor
(
ctx
,
transition_exps_src
,
transition_exps_dst
);
copyTensor
(
ctx
,
alpha_src
,
alpha_dst
);
copyTensor
(
ctx
,
ll_src
,
ll_dst
);
}
T
ForwardOneSequence
(
const
Tensor
&
emission
,
const
Tensor
&
emission_row_max
,
const
Tensor
&
emission_exps
,
const
Tensor
&
trans_weights
,
const
Tensor
&
trans_weight_exps
,
const
Tensor
&
label
,
...
...
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
auto
lod
=
ctx
.
Input
<
LoDTensor
>
(
"Label"
)
->
lod
();
PADDLE_ENFORCE
(
lod
.
size
(),
"Input(Label) must be a sequence."
);
// These local variables hold the inputs and outputs, guaranteeing that they
// are in CPU memory, to provide a consistent reference.
// TODO(caoying) Fix this by moving all these local variables into the
// class's data members once we can profile the training process, or
// implementing a real GPU kernel for CRF.
Tensor
*
label
=
nullptr
;
Tensor
label_tensor
;
Tensor
*
emission_exps
=
nullptr
;
Tensor
emission_exps_tensor
;
Tensor
*
transition_exps
=
nullptr
;
Tensor
transition_exps_tensor
;
Tensor
*
alpha
=
nullptr
;
Tensor
alpha_tensor
;
Tensor
ll_grad_tensor
;
T
*
ll_grad
=
nullptr
;
Tensor
*
emission_grad
=
nullptr
;
Tensor
emission_grad_tensor
;
Tensor
*
transition_grad
=
nullptr
;
Tensor
transition_grad_tensor
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
label
=
&
label_tensor
;
emission_exps
=
&
emission_exps_tensor
;
transition_exps
=
&
transition_exps_tensor
;
alpha
=
&
alpha_tensor
;
CopyInputsToCpuMemory
(
ctx
.
device_context
(),
*
ctx
.
Input
<
LoDTensor
>
(
"Label"
),
*
ctx
.
Input
<
Tensor
>
(
"EmissionExps"
),
*
ctx
.
Input
<
Tensor
>
(
"TransitionExps"
),
*
ctx
.
Input
<
Tensor
>
(
"Alpha"
),
*
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"LogLikelihood"
)),
label
,
emission_exps
,
transition_exps
,
alpha
,
&
ll_grad_tensor
);
ll_grad
=
ll_grad_tensor
.
data
<
T
>
();
if
(
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Emission"
)))
{
emission_grad
=
&
emission_grad_tensor
;
emission_grad
->
Resize
(
emission_exps
->
dims
());
}
const
Tensor
*
label
=
ctx
.
Input
<
LoDTensor
>
(
"Label"
);
const
Tensor
*
emission_exps
=
ctx
.
Input
<
Tensor
>
(
"EmissionExps"
);
const
Tensor
*
transition_exps
=
ctx
.
Input
<
Tensor
>
(
"TransitionExps"
);
const
Tensor
*
alpha
=
ctx
.
Input
<
Tensor
>
(
"Alpha"
);
const
T
*
ll_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"LogLikelihood"
))
->
data
<
T
>
();
if
(
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Transition"
)))
{
transition_grad
=
&
transition_grad_tensor
;
transition_grad
->
Resize
(
transition_exps
->
dims
());
}
}
else
{
label
=
const_cast
<
LoDTensor
*>
(
ctx
.
Input
<
LoDTensor
>
(
"Label"
));
emission_exps
=
const_cast
<
Tensor
*>
(
ctx
.
Input
<
Tensor
>
(
"EmissionExps"
));
transition_exps
=
const_cast
<
Tensor
*>
(
ctx
.
Input
<
Tensor
>
(
"TransitionExps"
));
alpha
=
const_cast
<
Tensor
*>
(
ctx
.
Input
<
Tensor
>
(
"Alpha"
));
ll_grad
=
const_cast
<
Tensor
*>
(
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"LogLikelihood"
)))
->
data
<
T
>
();
emission_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Emission"
));
transition_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Transition"
));
}
Tensor
*
emission_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Emission"
));
Tensor
*
transition_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Transition"
));
// TODO(caoying) Fix this constraint. When the Input(Emission) is from the
// data reader operator, it can have no gradients.
...
...
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
one_seq_emission_exps
,
*
transition_exps
,
one_seq_alpha
,
one_seq_label
,
&
one_seq_beta
,
transition_grad
,
&
one_seq_emission_grad
);
}
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
CopyOutputsToGpuMemory
(
ctx
.
device_context
(),
emission_grad
,
transition_grad
,
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Emission"
)),
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Transition"
)));
}
};
private:
void
CopyInputsToCpuMemory
(
const
platform
::
DeviceContext
&
ctx
,
const
LoDTensor
&
label_src
,
const
Tensor
&
emission_exps_src
,
const
Tensor
&
transition_exps_src
,
const
Tensor
&
alpha_src
,
const
Tensor
&
ll_grad_src
,
Tensor
*
label_dst
,
Tensor
*
emission_exps_dst
,
Tensor
*
transition_exps_dst
,
Tensor
*
alpha_dst
,
Tensor
*
ll_grad_dst
)
const
{
// Copy the inputs from GPU memory to CPU memory when this operator runs on
// a GPU device.
label_dst
->
mutable_data
<
T
>
(
label_src
.
dims
(),
platform
::
CPUPlace
());
framework
::
Copy
(
label_src
,
platform
::
CPUPlace
(),
ctx
,
label_dst
);
auto
copyTensor
=
[](
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
src
,
Tensor
*
dst
)
{
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
platform
::
CPUPlace
());
framework
::
Copy
(
src
,
platform
::
CPUPlace
(),
ctx
,
dst
);
};
copyTensor
(
ctx
,
emission_exps_src
,
emission_exps_dst
);
copyTensor
(
ctx
,
transition_exps_src
,
transition_exps_dst
);
copyTensor
(
ctx
,
alpha_src
,
alpha_dst
);
copyTensor
(
ctx
,
ll_grad_src
,
ll_grad_dst
);
}
void
CopyOutputsToGpuMemory
(
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
*
emission_grad_src
,
const
Tensor
*
transition_grad_src
,
Tensor
*
emission_grad_dst
,
Tensor
*
transition_grad_dst
)
const
{
// Copy the backward results from CPU memory to GPU
// memory if this operator runs on a GPU device.
auto
copyTensor
=
[](
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
*
src
,
Tensor
*
dst
)
{
if
(
src
&&
dst
)
{
dst
->
mutable_data
<
T
>
(
platform
::
CUDAPlace
());
framework
::
Copy
(
*
src
,
platform
::
CUDAPlace
(),
ctx
,
dst
);
}
};
copyTensor
(
ctx
,
emission_grad_src
,
emission_grad_dst
);
copyTensor
(
ctx
,
transition_grad_src
,
transition_grad_dst
);
}
void
BackwardOneSequence
(
const
platform
::
CPUDeviceContext
&
ctx
,
const
T
ll_grad
,
const
Tensor
&
emission_exps
,
const
Tensor
&
transition_exps
,
const
Tensor
&
alpha
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录