Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
0be1e09f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0be1e09f
编写于
3月 28, 2018
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"fix ci"
上级
5447046a
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
183 addition
and
167 deletion
+183
-167
paddle/fluid/operators/sequence_expand_op.cc
paddle/fluid/operators/sequence_expand_op.cc
+2
-3
paddle/fluid/operators/sequence_expand_op.cu
paddle/fluid/operators/sequence_expand_op.cu
+99
-94
paddle/fluid/operators/sequence_expand_op.h
paddle/fluid/operators/sequence_expand_op.h
+73
-57
python/paddle/fluid/tests/unittests/test_sequence_expand.py
python/paddle/fluid/tests/unittests/test_sequence_expand.py
+9
-13
未找到文件。
paddle/fluid/operators/sequence_expand_op.cc
浏览文件 @
0be1e09f
...
...
@@ -84,13 +84,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
}
}
out_dims
[
0
]
=
out_first_dim
;
ctx
->
SetOutputDim
(
"Out"
,
out_dims
);
}
else
{
out_dims
[
0
]
=
-
1
;
}
ctx
->
SetOutputDim
(
"Out"
,
out_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
}
};
class
SequenceExpandOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
paddle/fluid/operators/sequence_expand_op.cu
浏览文件 @
0be1e09f
...
...
@@ -24,123 +24,128 @@ namespace operators {
using
LoDTensor
=
framework
::
LoDTensor
;
template
<
typename
T
>
__global__
void
sequence_expand_kernel
(
const
T
*
x_data
,
T
*
out_data
,
const
size_t
*
lod
,
const
size_t
*
out_offset
,
size_t
lod_size
,
size_t
element_len
,
size_t
x_size
)
{
int
bid_x
=
blockIdx
.
x
;
if
(
bid_x
>
lod_size
)
return
;
int
repeats
=
lod
[
bid_x
];
int
offset
=
out_offset
[
bid_x
]
;
for
(
int
tid_y
=
threadIdx
.
y
;
tid_y
<
repeats
;
tid_y
+=
blockDim
.
y
)
{
for
(
int
tid_x
=
threadIdx
.
x
;
tid_x
<
element_len
;
tid_x
+=
blockDim
.
x
)
{
out_data
[(
offset
+
tid_y
)
*
element_len
+
tid_x
]
=
x_data
[
bid_x
*
element_len
+
tid_x
]
;
__global__
void
sequence_expand_kernel
(
const
T
*
x_data
,
const
size_t
*
x_lod
,
const
size_t
*
ref_
lod
,
const
size_t
lod_size
,
/* default=1
,
the instance length*/
const
int
x_item_length
,
T
*
out_data
)
{
constexpr
int
N
=
1024
;
__shared__
int
mem
[
N
];
int
offset
=
0
;
for
(
int
i
=
0
;
i
<
lod_size
;
++
i
)
{
mem
[
i
]
=
offset
;
if
(
i
<
lod_size
-
1
)
{
offset
+=
(
ref_lod
[
i
+
1
]
-
ref_lod
[
i
])
*
(
x_lod
[
i
+
1
]
-
x_lod
[
i
])
;
}
}
}
__syncthreads
();
template
<
typename
T
>
__global__
void
sequence_expand_grad_kernel
(
const
T
*
dout_data
,
T
*
dx_data
,
const
size_t
*
lod
,
const
size_t
*
out_offset
,
size_t
lod_size
,
size_t
element_len
,
size_t
dout_size
,
size_t
dx_size
)
{
// reduce visit memory time.
// dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1]
if
(
blockIdx
.
x
==
0
&&
blockIdx
.
y
==
0
&&
threadIdx
.
x
==
0
&&
threadIdx
.
y
==
0
)
{
printf
(
"lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld
\n
"
,
lod_size
,
element_len
,
dout_size
,
dx_size
);
}
extern
__shared__
T
shm
[];
T
*
dout_shm
=
shm
;
T
*
dx_shm
=
&
shm
[
dout_size
];
// int idx = threadIdx.x + blockIdx.x * blockDim.x;
for
(
int
idx
=
0
;
idx
<
dout_size
;
++
idx
)
{
if
(
idx
<
dx_size
)
{
dx_shm
[
idx
]
=
0.0
;
int
bid
=
blockIdx
.
x
;
if
(
bid
>=
lod_size
-
1
)
return
;
int
x_item_count
=
x_lod
[
bid
+
1
]
-
x_lod
[
bid
];
int
repeats
=
ref_lod
[
bid
+
1
]
-
ref_lod
[
bid
];
int
out_offset
=
mem
[
bid
];
int
x_offset
=
x_lod
[
bid
];
for
(
int
tid_z
=
threadIdx
.
z
;
tid_z
<
repeats
;
tid_z
+=
blockDim
.
z
)
{
for
(
int
tid_y
=
threadIdx
.
y
;
tid_y
<
x_item_count
;
tid_y
+=
blockDim
.
y
)
{
for
(
int
tid_x
=
threadIdx
.
x
;
tid_x
<
x_item_length
;
tid_x
+=
blockDim
.
x
)
{
out_data
[(
out_offset
+
tid_z
*
x_item_count
+
tid_y
)
*
x_item_length
+
tid_x
]
=
x_data
[(
x_offset
+
tid_y
)
*
x_item_length
+
tid_x
];
}
if
(
idx
<
dout_size
)
{
dout_shm
[
idx
]
=
dout_data
[
idx
];
}
}
}
int
bid_x
=
blockIdx
.
x
;
if
(
bid_x
>
lod_size
)
return
;
int
repeats
=
lod
[
bid_x
];
int
offset
=
out_offset
[
bid_x
];
if
(
threadIdx
.
x
==
0
)
{
printf
(
"repeats=%d, offset=%ld
\n
"
,
repeats
,
offset
);
}
for
(
int
tid_y
=
threadIdx
.
y
;
tid_y
<
repeats
;
tid_y
+=
blockDim
.
y
)
{
for
(
int
tid_x
=
threadIdx
.
x
;
tid_x
<
element_len
;
tid_x
+=
blockDim
.
x
)
{
T
val
=
dout_shm
[(
offset
+
tid_y
)
*
element_len
+
tid_x
];
platform
::
CudaAtomicAdd
(
&
dx_shm
[
bid_x
*
element_len
+
tid_x
],
val
);
int
dx_idx
=
bid_x
*
element_len
+
tid_x
;
int
dout_idx
=
(
offset
+
tid_y
)
*
element_len
+
tid_x
;
printf
(
"dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f
\n
"
,
dx_idx
,
dout_idx
,
dx_shm
[
dx_idx
],
dout_shm
[
dout_idx
],
val
);
template
<
typename
T
>
__global__
void
sequence_expand_grad_kernel
(
const
T
*
dout_data
,
const
size_t
*
ref_lod
,
const
size_t
*
dx_lod
,
const
size_t
lod_size
,
/* default=1,
the instance length*/
const
int
x_item_length
,
T
*
dx_data
)
{
// TODO(dzhwinter) : too many atomicAdd
// use shared memory to reduce memory visits
constexpr
int
N
=
1024
;
__shared__
int
mem
[
N
];
int
offset
=
0
;
for
(
int
i
=
0
;
i
<
lod_size
;
++
i
)
{
mem
[
i
]
=
offset
;
if
(
i
<
lod_size
-
1
)
{
offset
+=
(
ref_lod
[
i
+
1
]
-
ref_lod
[
i
])
*
(
dx_lod
[
i
+
1
]
-
dx_lod
[
i
]);
}
}
__syncthreads
();
// copy shared memory back to dx
for
(
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
idx
<
dx_size
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
dx_data
[
idx
]
=
dx_shm
[
idx
];
int
bid
=
blockIdx
.
x
;
if
(
bid
>=
lod_size
-
1
)
return
;
int
x_item_count
=
dx_lod
[
bid
+
1
]
-
dx_lod
[
bid
];
int
repeats
=
ref_lod
[
bid
+
1
]
-
ref_lod
[
bid
];
int
out_offset
=
mem
[
bid
];
int
x_offset
=
dx_lod
[
bid
];
for
(
int
tid_z
=
threadIdx
.
z
;
tid_z
<
repeats
;
tid_z
+=
blockDim
.
z
)
{
for
(
int
tid_y
=
threadIdx
.
y
;
tid_y
<
x_item_count
;
tid_y
+=
blockDim
.
y
)
{
for
(
int
tid_x
=
threadIdx
.
x
;
tid_x
<
x_item_length
;
tid_x
+=
blockDim
.
x
)
{
platform
::
CudaAtomicAdd
(
&
dx_data
[(
x_offset
+
tid_y
)
*
x_item_length
+
tid_x
],
dout_data
[(
out_offset
+
tid_z
*
x_item_count
+
tid_y
)
*
x_item_length
+
tid_x
]);
}
}
}
}
template
<
typename
T
>
struct
SequenceExpandFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
LoDTensor
&
x
,
LoDTensor
*
out
)
{
auto
x_dims
=
x
.
dims
();
size_t
element_len
=
framework
::
product
(
x_dims
)
/
x_dims
[
0
];
auto
lod
=
out
->
lod
().
back
();
framework
::
Vector
<
size_t
>
out_lod
;
for
(
size_t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
out_lod
.
push_back
(
lod
[
i
+
1
]
-
lod
[
i
])
;
}
int
thread_
x
=
std
::
max
(
static_cast
<
int
>
(
element_len
),
32
);
int
block_x
=
static_cast
<
int
>
(
out
_lod
.
size
());
dim3
block_size
(
thread_x
,
1024
/
thread_x
);
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
LoDTensor
&
x
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
out
)
{
int
x_item_length
=
1
;
x_item_length
=
x
.
numel
()
/
x
.
dims
()[
0
];
VLOG
(
0
)
<<
"x_item_length"
<<
x_item_length
;
int
thread_x
=
std
::
max
(
static_cast
<
int
>
(
ref_lod
.
size
()),
32
);
int
thread_y
=
std
::
max
(
1024
/
thread_x
,
16
);
int
thread_
z
=
std
::
min
(
1024
/
thread_x
/
thread_y
,
16
);
int
block_x
=
static_cast
<
int
>
(
ref
_lod
.
size
());
dim3
block_size
(
thread_x
,
thread_y
,
thread_z
);
dim3
grid_size
(
block_x
,
1
);
sequence_expand_kernel
<<<
grid_size
,
block_size
,
0
,
context
.
stream
()
>>>
(
x
.
data
<
T
>
(),
out
->
mutable_data
<
T
>
(
context
.
GetPlace
()),
out_lod
.
CUDAData
(
context
.
GetPlace
()),
lod
.
CUDAData
(
context
.
GetPlace
())
,
out
_lod
.
size
(),
element_len
,
framework
::
product
(
x_dims
));
x
.
data
<
T
>
(),
x_lod
.
CUDAData
(
context
.
GetPlace
()),
ref_lod
.
CUDAData
(
context
.
GetPlace
()),
x_lod
.
size
(),
x_item_length
,
out
->
mutable_data
<
T
>
(
context
.
GetPlace
()
));
}
};
template
<
typename
T
>
struct
SequenceExpandGradFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
LoDTensor
&
x
,
const
LoDTensor
&
out
,
const
LoDTensor
&
dout
,
LoDTensor
*
dx
)
{
auto
x_dims
=
x
.
dims
();
size_t
element_len
=
framework
::
product
(
x_dims
)
/
x_dims
[
0
];
auto
lod
=
out
.
lod
().
back
();
framework
::
Vector
<
size_t
>
out_lod
;
for
(
size_t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
out_lod
.
push_back
(
lod
[
i
+
1
]
-
lod
[
i
]);
}
size_t
dout_size
=
framework
::
product
(
dout
.
dims
());
size_t
dx_size
=
framework
::
product
(
dx
->
dims
());
int
thread_x
=
std
::
max
(
static_cast
<
int
>
(
element_len
),
32
);
dim3
block_size
(
thread_x
,
1024
/
thread_x
);
int
block_x
=
static_cast
<
int
>
(
out_lod
.
size
());
const
LoDTensor
&
dout
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand based lod*/
LoDTensor
*
dx
)
{
int
x_item_length
=
1
;
x_item_length
=
framework
::
product
(
dx
->
dims
())
/
dx
->
dims
()[
0
];
int
thread_x
=
std
::
max
(
static_cast
<
int
>
(
ref_lod
.
size
()),
32
);
int
thread_y
=
std
::
max
(
1024
/
thread_x
,
16
);
int
thread_z
=
std
::
min
(
1024
/
thread_x
/
thread_y
,
16
);
int
block_x
=
static_cast
<
int
>
(
ref_lod
.
size
());
dim3
block_size
(
thread_x
,
thread_y
,
thread_z
);
dim3
grid_size
(
block_x
,
1
);
sequence_expand_grad_kernel
<<<
grid_size
,
block_size
,
(
dout_size
+
dx_size
)
*
sizeof
(
T
),
context
.
stream
()
>>>
(
dout
.
data
<
T
>
(),
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
()),
out_lod
.
CUDAData
(
context
.
GetPlace
()),
lod
.
CUDAData
(
context
.
GetPlace
()),
out_lod
.
size
(),
element_len
,
dout_size
,
dx_size
);
sequence_expand_grad_kernel
<<<
grid_size
,
block_size
,
0
,
context
.
stream
()
>>>
(
dout
.
data
<
T
>
(),
ref_lod
.
CUDAData
(
context
.
GetPlace
()),
x_lod
.
CUDAData
(
context
.
GetPlace
()),
ref_lod
.
size
(),
x_item_length
,
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
()));
}
};
...
...
paddle/fluid/operators/sequence_expand_op.h
浏览文件 @
0be1e09f
...
...
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <numeric> // std::i
to
a
#include <numeric> // std::i
ot
a
#include <glog/logging.h>
#include <sstream>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -29,40 +31,42 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template
<
typename
DeviceContext
,
typename
T
>
struct
SequenceExpandFunctor
{
void
operator
()(
const
DeviceContext
&
ctx
,
const
LoDTensor
&
x
,
LoDTensor
*
out
);
void
operator
()(
const
DeviceContext
&
ctx
,
const
LoDTensor
&
x
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
out
);
};
template
<
typename
DeviceContext
,
typename
T
>
struct
SequenceExpandGradFunctor
{
void
operator
()(
const
DeviceContext
&
ctx
,
const
LoDTensor
&
x
,
const
LoDTensor
&
out
,
const
LoDTensor
&
dout
,
LoDTensor
*
dx
);
void
operator
()(
const
DeviceContext
&
ctx
,
const
LoDTensor
&
dout
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
dx
);
};
template
<
typename
T
>
struct
SequenceExpandFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
LoDTensor
&
x
,
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
LoDTensor
&
x
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
out
)
{
auto
&
out_lod
=
out
->
lod
()[
0
];
framework
::
Vector
<
size_t
>
x_lod
;
if
(
x
.
lod
()
==
1
)
{
x_lod
=
x
.
lod
()[
0
];
}
else
{
x_lod
.
reserve
(
out_lod
.
size
());
std
::
itoa
(
x_lod
.
begin
(),
x_lod
.
end
(),
0
);
// fill 0 ~ out_lod.size()-1
}
int
out_offset
=
0
;
auto
&
eigen_place
=
*
context
.
eigen_device
();
for
(
size_t
i
=
1
;
i
<
out
_lod
.
size
();
++
i
)
{
int
repeat_num
=
y_lod
[
ref_level
][
i
]
-
y_lod
[
ref_level
]
[
i
-
1
];
for
(
size_t
i
=
1
;
i
<
ref
_lod
.
size
();
++
i
)
{
int
repeat_num
=
ref_lod
[
i
]
-
ref_lod
[
i
-
1
];
int
x_start
=
x_lod
[
i
-
1
];
int
x_end
=
x_lod
[
i
];
int
x_seq_len
=
x_end
-
x_start
;
if
(
repeat_num
>
0
)
{
auto
x_sub_tensor
=
x
->
Slice
(
x_start
,
x_end
);
auto
x_sub_tensor
=
x
.
Slice
(
x_start
,
x_end
);
x_sub_tensor
.
Resize
({
1
,
x_sub_tensor
.
numel
()});
int
out_start
=
out_offset
;
if
(
x_lod
.
size
()
==
1
)
{
out_start
=
out
_lod
[
0
][
out_offset
];
if
(
out
->
lod
()
.
size
()
==
1
)
{
out_start
=
out
->
lod
()
[
0
][
out_offset
];
}
auto
out_sub_tensor
=
out
->
Slice
(
out_start
,
out_start
+
x_seq_len
*
repeat_num
);
...
...
@@ -71,6 +75,7 @@ struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
EigenMatrix
<
T
>::
From
(
x_sub_tensor
)
.
broadcast
(
Eigen
::
array
<
int
,
2
>
({{
repeat_num
,
1
}}));
}
out_offset
+=
repeat_num
;
}
}
};
...
...
@@ -96,13 +101,10 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
return
;
}
auto
&
out_lod
=
*
out
->
mutable_lod
();
// x lod level is at most 1.
if
(
x_lod
.
size
()
==
0
)
{
out_lod
=
y_lod
[
ref_level
];
}
else
if
(
x_lod
.
size
()
==
1
)
{
out_lod
.
resize
(
1
);
out_lod
[
0
]
=
{
0
};
framework
::
Vector
<
size_t
>
out_lod
;
if
(
x_lod
.
size
()
==
1
)
{
out_lod
.
push_back
(
0
);
int
out_offset
=
0
;
for
(
size_t
i
=
1
;
i
<
y_lod
[
ref_level
].
size
();
++
i
)
{
int
repeat_num
=
y_lod
[
ref_level
][
i
]
-
y_lod
[
ref_level
][
i
-
1
];
...
...
@@ -110,14 +112,25 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
int
x_end
=
x_lod
[
0
][
i
];
int
x_seq_len
=
x_end
-
x_start
;
for
(
int
j
=
0
;
j
<
repeat_num
;
++
j
)
{
out_lod
[
0
].
push_back
(
out_lod
[
0
]
.
back
()
+
x_seq_len
);
out_lod
.
push_back
(
out_lod
.
back
()
+
x_seq_len
);
out_offset
++
;
}
}
// write lod to out if x has lod
auto
&
ref_lod
=
*
out
->
mutable_lod
();
ref_lod
[
0
]
=
out_lod
;
}
framework
::
Vector
<
size_t
>
ref_x_lod
;
if
(
x
->
lod
().
size
()
==
1
)
{
ref_x_lod
=
x
->
lod
()[
0
];
}
else
{
// x_lod doesn't has lod, use fake x lod, level = 0
ref_x_lod
.
resize
(
x
->
dims
()[
0
]
+
1
);
std
::
iota
(
ref_x_lod
.
begin
(),
ref_x_lod
.
end
(),
0
);
}
SequenceExpandFunctor
<
DeviceContext
,
T
>
functor
;
functor
(
context
.
template
device_context
<
DeviceContext
>(),
*
x
,
out
);
functor
(
context
.
template
device_context
<
DeviceContext
>(),
*
x
,
ref_x_lod
,
y_lod
[
ref_level
],
out
);
}
};
...
...
@@ -135,32 +148,29 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
* */
template
<
typename
T
>
struct
SequenceExpandGradFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
LoDTensor
&
x
,
const
LoDTensor
&
out
,
const
LoDTensor
&
dout
,
LoDTensor
*
dx
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
g_x
,
static_cast
<
T
>
(
0
));
int
g_out_offset
=
0
;
for
(
size_t
i
=
1
;
i
<
y_lod
[
ref_level
].
size
();
++
i
)
{
int
repeat_num
=
y_lod
[
ref_level
][
i
]
-
y_lod
[
ref_level
][
i
-
1
];
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
LoDTensor
&
dout
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
dx
)
{
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
set_zero
;
set_zero
(
context
,
dx
,
static_cast
<
T
>
(
0
));
int
dout_offset
=
0
;
for
(
size_t
i
=
1
;
i
<
ref_lod
.
size
();
++
i
)
{
int
repeat_num
=
ref_lod
[
i
]
-
ref_lod
[
i
-
1
];
if
(
repeat_num
>
0
)
{
int
x_start
=
i
-
1
;
int
x_end
=
i
;
if
(
x_lod
.
size
()
==
1
)
{
x_start
=
x_lod
[
0
][
i
-
1
];
x_end
=
x_lod
[
0
][
i
];
}
int
x_start
=
x_lod
[
i
-
1
];
int
x_end
=
x_lod
[
i
];
int
x_seq_len
=
x_end
-
x_start
;
auto
g_x_sub
=
g_
x
->
Slice
(
x_start
,
x_end
);
g_x_sub
.
Resize
(
flatten_to_1d
(
g_
x_sub
.
dims
()));
int
g_out_end
=
g_
out_offset
+
repeat_num
*
x_seq_len
;
auto
g_out_sub
=
g_out
->
Slice
(
g_out_offset
,
g_
out_end
);
g_out_sub
.
Resize
({
repeat_num
,
g_
x_sub
.
dims
()[
0
]});
math
::
ColwiseSum
<
DeviceContext
,
T
>
col_sum
;
col_sum
(
dev_ctx
,
g_out_sub
,
&
g_
x_sub
);
g_
out_offset
+=
repeat_num
*
x_seq_len
;
auto
dx_sub
=
d
x
->
Slice
(
x_start
,
x_end
);
dx_sub
.
Resize
(
flatten_to_1d
(
d
x_sub
.
dims
()));
int
dout_end
=
d
out_offset
+
repeat_num
*
x_seq_len
;
auto
dout_sub
=
dout
.
Slice
(
dout_offset
,
d
out_end
);
dout_sub
.
Resize
({
repeat_num
,
d
x_sub
.
dims
()[
0
]});
math
::
ColwiseSum
<
platform
::
CPU
DeviceContext
,
T
>
col_sum
;
col_sum
(
context
,
dout_sub
,
&
d
x_sub
);
d
out_offset
+=
repeat_num
*
x_seq_len
;
}
}
}
...
...
@@ -179,20 +189,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
g_x
->
set_lod
(
x
->
lod
());
auto
&
x_lod
=
x
->
lod
();
auto
&
y_lod
=
y
->
lod
();
if
(
ref_level
==
-
1
)
ref_level
=
y_lod
.
size
()
-
1
;
// just copy the gradient
if
(
y_lod
[
ref_level
].
size
()
<=
1
)
{
framework
::
TensorCopy
(
*
g_out
,
context
.
GetPlace
(),
g_x
);
return
;
}
framework
::
Vector
<
size_t
>
ref_x_lod
;
framework
::
Vector
<
size_t
>
ref_lod
=
y_lod
[
ref_level
];
if
(
x
->
lod
().
size
()
==
1
)
{
ref_x_lod
=
x
->
lod
()[
0
];
}
else
{
// x_lod doesn't has lod, use fake x lod, level = 0
ref_x_lod
.
resize
(
x
->
dims
()[
0
]
+
1
);
std
::
iota
(
ref_x_lod
.
begin
(),
ref_x_lod
.
end
(),
0
);
}
SequenceExpandGradFunctor
<
DeviceContext
,
T
>
functor
;
functor
(
context
.
template
device_context
<
DeviceContext
>(),
*
x
,
*
y
,
*
g_out
,
g_x
);
functor
(
context
.
template
device_context
<
DeviceContext
>(),
*
g_out
,
ref_x_lod
,
ref_lod
,
g_x
);
}
};
...
...
python/paddle/fluid/tests/unittests/test_sequence_expand.py
浏览文件 @
0be1e09f
...
...
@@ -19,14 +19,8 @@ from op_test import OpTest
class
TestSequenceExpand
(
OpTest
):
def
set_data
(
self
):
x
=
[
i
/
10.0
for
i
in
range
(
3
)]
y
=
[
i
/
10.0
for
i
in
range
(
8
)]
x_data
=
np
.
array
(
x
).
reshape
(
3
,
1
).
astype
(
'float32'
)
y_data
=
np
.
array
(
y
).
reshape
(
8
,
1
).
astype
(
'float32'
)
print
(
x_data
)
print
(
y_data
)
# x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
# y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
x_data
=
np
.
random
.
uniform
(
0.1
,
1
,
[
3
,
1
]).
astype
(
'float32'
)
y_data
=
np
.
random
.
uniform
(
0.1
,
1
,
[
8
,
1
]).
astype
(
'float32'
)
y_lod
=
[[
0
,
1
,
4
,
8
]]
self
.
inputs
=
{
'X'
:
x_data
,
'Y'
:
(
y_data
,
y_lod
)}
...
...
@@ -53,8 +47,10 @@ class TestSequenceExpand(OpTest):
x_len
=
x_idx
[
i
]
-
x_idx
[
i
-
1
]
if
repeat_num
>
0
:
x_sub
=
x_data
[
x_idx
[
i
-
1
]:
x_idx
[
i
],
:]
x_sub
=
np
.
repeat
(
x_sub
,
repeat_num
,
axis
=
0
)
out
=
np
.
vstack
((
out
,
x_sub
))
stacked_x_sub
=
x_sub
for
r
in
range
(
repeat_num
-
1
):
stacked_x_sub
=
np
.
vstack
((
stacked_x_sub
,
x_sub
))
out
=
np
.
vstack
((
out
,
stacked_x_sub
))
if
x_lod
is
not
None
:
for
j
in
xrange
(
repeat_num
):
out_lod
[
0
].
append
(
out_lod
[
0
][
-
1
]
+
x_len
)
...
...
@@ -107,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
class
TestSequenceExpandCase4
(
TestSequenceExpand
):
def
set_data
(
self
):
data
=
[
0.1
,
0.3
,
0.2
,
0.15
,
0.25
,
0.2
,
0.15
,
0.25
,
0.1
,
0.3
]
data
=
np
.
random
.
uniform
(
0.1
,
1
,
[
5
*
2
,
1
])
x_data
=
np
.
array
(
data
).
reshape
([
5
,
2
]).
astype
(
'float32'
)
x_lod
=
[[
0
,
2
,
5
]]
y_data
=
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
1
]).
astype
(
'float32'
)
y_lod
=
[[
0
,
1
,
2
],
[
0
,
1
,
2
]]
y_data
=
np
.
random
.
uniform
(
0.1
,
1
,
[
3
,
1
]).
astype
(
'float32'
)
y_lod
=
[[
0
,
1
,
3
],
[
0
,
1
,
3
]]
self
.
inputs
=
{
'X'
:
(
x_data
,
x_lod
),
'Y'
:
(
y_data
,
y_lod
)}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录