Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
c66eec75
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c66eec75
编写于
9月 25, 2020
作者:
P
pangyoki
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
support num_distribution different multinomial distributions
上级
8e14302f
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
150 additions
and
55 deletions
+150
-55
paddle/fluid/operators/multinomial_op.cc
paddle/fluid/operators/multinomial_op.cc
+8
-1
paddle/fluid/operators/multinomial_op.cu
paddle/fluid/operators/multinomial_op.cu
+134
-51
python/paddle/fluid/tests/unittests/test_multinomial_op.py
python/paddle/fluid/tests/unittests/test_multinomial_op.py
+8
-3
未找到文件。
paddle/fluid/operators/multinomial_op.cc
浏览文件 @
c66eec75
...
...
@@ -30,6 +30,7 @@ class MultinomialOpMaker : public framework::OpProtoAndCheckerMaker {
void
Make
()
override
{
AddInput
(
"X"
,
"A tensor contains probabilities of categories"
);
AddOutput
(
"Out"
,
"The output tensor of multinomial op"
);
// AddOutput("yokiOut", "yoki");
AddAttr
<
int
>
(
"num_samples"
,
"number of the generated samples"
)
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"replacement"
,
"can a category be sampled more than once"
)
...
...
@@ -49,7 +50,7 @@ class MultinomialOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"Multinomial"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"Multinomial"
);
//
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial");
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
int64_t
x_rank
=
x_dim
.
size
();
...
...
@@ -62,6 +63,7 @@ class MultinomialOp : public framework::OperatorWithKernel {
out_dims
[
x_rank
-
1
]
=
num_samples
;
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims
));
// ctx->SetOutputDim("yokiOut", x_dim);
}
};
...
...
@@ -72,11 +74,16 @@ class MultinomialOpKernel<platform::CPUDeviceContext, T>
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
x
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
// auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
const
int64_t
num_samples
=
ctx
.
Attr
<
int
>
(
"num_samples"
);
const
bool
replacement
=
ctx
.
Attr
<
bool
>
(
"replacement"
);
auto
*
in_data
=
x
->
data
<
T
>
();
auto
*
out_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
/*auto *yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < x->numel(); i++) {
yokiout_data[i] = in_data[i];
}*/
auto
in_dims
=
x
->
dims
();
int64_t
in_rank
=
in_dims
.
size
();
...
...
paddle/fluid/operators/multinomial_op.cu
浏览文件 @
c66eec75
...
...
@@ -70,8 +70,31 @@ template <typename T>
// Normalizes unnormalized category weights into probabilities.
//
// Each thread handles exactly one element: it divides in_data[id] by the sum
// stored for its row, sum_rows[blockIdx.y], and writes the result to
// norm_probs[id].
//
// Parameters:
//   norm_probs - output buffer, written at the same linear index as in_data.
//   in_data    - input weights, read at linear index `id`.
//   sum_rows   - per-row sums, indexed by blockIdx.y.
//
// The kernel is meant to be launched on a 2D grid of 1D blocks, where
// blockIdx.y selects the row (distribution) and (blockIdx.x, threadIdx.x)
// select the element inside the row.
//
// NOTE(review): there is no bounds check, and the row stride used below is
// gridDim.x * blockDim.x. The indexing therefore only matches a densely packed
// [rows, cols] buffer when gridDim.x * blockDim.x equals the number of
// columns - confirm against the launch configuration.
__global__ void NormalizeProbability(T* norm_probs, const T* in_data,
                                     T* sum_rows) {
  // Global linear index for a 2D grid of 1D blocks. Only this single
  // declaration of `id` is kept; the earlier `threadIdx.x`-only index and the
  // division by sum_rows[0] handled just one distribution and clashed with
  // this declaration in the same scope.
  int id = threadIdx.x + blockIdx.x * blockDim.x +
           blockIdx.y * gridDim.x * blockDim.x;
  norm_probs[id] = in_data[id] / sum_rows[blockIdx.y];
}
// Element-wise copy kernel: every thread copies one value from `in_data` to
// the same position in `out`.
//
// Parameters:
//   in_data - source buffer, read at the thread's linear index.
//   out     - destination buffer, written at the same linear index.
//
// The linear index is built for a 2D grid of 1D blocks. No bounds check is
// performed, so the launch must not create more threads than there are
// elements.
// NOTE(review): this looks like a debugging helper (its only visible call
// site is commented out) - confirm before relying on it.
template <typename T>
__global__ void yokiFunc(const T* in_data, T* out) {
  // Offset contributed by the grid row (blockIdx.y).
  const unsigned int row_base = blockIdx.y * gridDim.x * blockDim.x;
  // Position inside the grid row.
  const unsigned int in_row = blockIdx.x * blockDim.x + threadIdx.x;
  const int pos = in_row + row_base;
  out[pos] = in_data[pos];
}
// Computes a row-wise inclusive prefix sum (cumulative sum).
//
// For every row r handled by this block, the values
// norm_probs_data[r * num_categories .. (r + 1) * num_categories) are scanned
// with thrust::inclusive_scan and the running sums are stored at the same
// offsets in cumulative_probs.
//
// Parameters:
//   norm_probs_data   - input values, num_categories entries per row.
//   num_distributions - number of rows to process.
//   num_categories    - number of entries in each row.
//   cumulative_probs  - output buffer with the same layout as the input.
//
// Rows are distributed over blocks: a block starts at row blockIdx.x and
// advances by gridDim.x until all num_distributions rows are covered.
// NOTE: the row index depends only on blockIdx.x, so every thread of a block
// performs the same scan.
template <typename T>
__global__ void Cumsum(T* norm_probs_data, int64_t num_distributions,
                       int64_t num_categories, T* cumulative_probs) {
  int row = blockIdx.x;
  while (row < num_distributions) {
    T* row_first = norm_probs_data + row * num_categories;
    T* row_last = norm_probs_data + (row + 1) * num_categories;
    T* row_out = cumulative_probs + row * num_categories;
    thrust::inclusive_scan(thrust::device, row_first, row_last, row_out);
    row += gridDim.x;
  }
}
template
<
typename
T
>
...
...
@@ -141,21 +164,29 @@ __global__ void sampleMultinomialWithReplacement(
// global index formula for 2D grid of 1D blocks
// int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x +
// threadIdx.x;
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(
int
sample
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
sample
<
totalSamples
;
sample
+=
blockDim
.
x
*
gridDim
.
x
)
{
// we are losing 3 out of 4 generated numbers but it's ok
// this kernel is not very efficient anyway
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
// T uniform_random = dist(rng);
T
uniform_random
=
rng
[
sample
]
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
+
blockIdx
.
y
*
gridDim
.
x
*
blockDim
.
x
;
// Find the bucket that a uniform sample lies in
int
choice
=
binarySearchForMultinomial
<
T
>
(
normDistPrefixSum
,
normDist
,
categories
,
uniform_random
);
for
(
int
curDist
=
blockIdx
.
y
;
curDist
<
distributions
;
curDist
+=
gridDim
.
y
)
{
for
(
int
sample
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
sample
<
totalSamples
;
sample
+=
blockDim
.
x
*
gridDim
.
x
)
{
// we are losing 3 out of 4 generated numbers but it's ok
// this kernel is not very efficient anyway
dest
[
sample
]
=
choice
;
// T uniform_random = dist(rng);
T
uniform_random
=
rng
[
sample
+
curDist
*
totalSamples
];
// Find the bucket that a uniform sample lies in
int
choice
=
binarySearchForMultinomial
<
T
>
(
normDistPrefixSum
+
curDist
*
categories
,
normDist
+
curDist
*
categories
,
categories
,
uniform_random
);
dest
[
sample
+
curDist
*
totalSamples
]
=
choice
;
}
}
}
...
...
@@ -167,17 +198,48 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
const
auto
x
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
// auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
const
int64_t
num_samples
=
ctx
.
Attr
<
int
>
(
"num_samples"
);
const
bool
replacement
=
ctx
.
Attr
<
bool
>
(
"replacement"
);
auto
*
in_data
=
x
->
data
<
T
>
();
auto
*
out_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// auto* yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
auto
in_dims
=
x
->
dims
();
int64_t
in_rank
=
in_dims
.
size
();
const
int64_t
num_categories
=
in_dims
[
in_rank
-
1
];
const
int64_t
num_distributions
=
in_rank
>
1
?
in_dims
[
in_rank
-
2
]
:
1
;
if
(
!
replacement
)
{
int
in_data_numel
=
x
->
numel
();
int
out_data_numel
=
out
->
numel
();
// std::vector<T> cpu_in_data(in_data_numel);
// std::vector<T> cpu_out_data(out_data_numel);
// T cpu_in_data[in_data_numel];
// T cpu_out_data[out_data_numel];
T
*
cpu_in_data
=
new
T
[
in_data_numel
];
T
*
cpu_out_data
=
new
T
[
out_data_numel
];
cudaMemcpy
(
cpu_in_data
,
in_data
,
in_data_numel
*
sizeof
(
T
),
cudaMemcpyDeviceToHost
);
VLOG
(
3
)
<<
"Print cpu_in_data "
<<
cpu_in_data
[
0
]
<<
"
\n
"
;
VLOG
(
3
)
<<
"Print in_data_numel "
<<
in_data_numel
<<
"
\n
"
;
VLOG
(
3
)
<<
"Print out_data_numel "
<<
out_data_numel
<<
"
\n
"
;
MultinomialFunctor
<
T
>
(
cpu_out_data
,
cpu_in_data
,
num_samples
,
replacement
,
num_categories
,
num_distributions
);
cudaMemcpy
(
out_data
,
cpu_out_data
,
out_data_numel
*
sizeof
(
T
),
cudaMemcpyHostToDevice
);
delete
[]
cpu_in_data
;
delete
[]
cpu_out_data
;
return
;
}
// std::vector<T> sum_rows(num_distributions);
// SumArrayCUDAKernel<T>(in_data, sum_rows,)
...
...
@@ -188,30 +250,44 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
VLOG
(
3
)
<<
"Print in_rank "
<<
in_rank
<<
"
\n
"
;
framework
::
Tensor
sum_rows_t
;
auto
*
sum_rows_data
=
sum_rows_t
.
mutable_data
<
T
>
({
1
},
ctx
.
GetPlace
());
auto
*
sum_rows_data
=
sum_rows_t
.
mutable_data
<
T
>
({
num_distributions
},
ctx
.
GetPlace
());
// auto* sum_rows_data =
// sum_rows_t->mutable_data<T>(framework::make_ddim({1}), ctx.GetPlace());
// sum_rows_t->mutable_data<T>(framework::make_ddim({num_distributions}),
// ctx.GetPlace());
auto
&
place
=
*
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>()
.
eigen_device
();
auto
eigen_input
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
x
);
// auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
auto
eigen_sum_rows
=
framework
::
EigenScalar
<
T
>::
From
(
sum_rows_t
);
eigen_sum_rows
.
device
(
place
)
=
eigen_input
.
sum
(
Eigen
::
DSizes
<
int
,
1
>
(
0
))
.
eval
()
.
reshape
(
Eigen
::
DSizes
<
int
,
1
>
(
sum_rows_t
.
dims
()[
0
]));
// eigen_sum_rows.device(place) =
// eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
dim3
grid
(
num_distributions
);
dim3
block
(
num_categories
);
if
(
num_distributions
==
1
)
{
auto
eigen_input
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
x
);
auto
eigen_sum_rows
=
framework
::
EigenVector
<
T
>::
From
(
sum_rows_t
);
// auto eigen_sum_rows = framework::EigenScalar<T>::From(sum_rows_t);
eigen_sum_rows
.
device
(
place
)
=
eigen_input
.
sum
(
Eigen
::
DSizes
<
int
,
1
>
(
1
))
.
eval
()
.
reshape
(
Eigen
::
DSizes
<
int
,
1
>
(
sum_rows_t
.
dims
()[
0
]));
}
else
{
auto
eigen_input
=
framework
::
EigenMatrix
<
T
>::
From
(
*
x
);
// auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
auto
eigen_sum_rows
=
framework
::
EigenVector
<
T
>::
From
(
sum_rows_t
);
eigen_sum_rows
.
device
(
place
)
=
eigen_input
.
sum
(
Eigen
::
DSizes
<
int
,
1
>
(
1
));
// .eval()
// .reshape(Eigen::DSizes<int, 1>(sum_rows_t.dims()[0]));
// eigen_sum_rows.device(place) =
// eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
}
// std::vector<T> in_data_norm(num_categories);
framework
::
Tensor
norm_probs_t
;
auto
*
norm_probs_data
=
norm_probs_t
.
mutable_data
<
T
>
({
num_categories
},
ctx
.
GetPlace
());
auto
*
norm_probs_data
=
norm_probs_t
.
mutable_data
<
T
>
(
{
num_distributions
,
num_categories
},
ctx
.
GetPlace
());
// dim3 grid(num_distributions);
// dim3 block(num_categories);
dim3
block
(
num_categories
<
512
?
num_categories
:
512
);
dim3
grid
((
num_categories
-
1
)
/
block
.
x
+
1
,
num_distributions
);
NormalizeProbability
<
T
><<<
grid
,
block
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
norm_probs_data
,
in_data
,
sum_rows_data
);
...
...
@@ -219,43 +295,46 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
// num_distributions can only be 1.
// std::vector<T> cumulative_probs(num_categories);
framework
::
Tensor
cumulative_probs_t
;
auto
*
cumulative_probs
=
cumulative_probs_t
.
mutable_data
<
T
>
({
num_categories
},
ctx
.
GetPlace
());
auto
*
cumulative_probs
=
cumulative_probs_t
.
mutable_data
<
T
>
(
{
num_distributions
,
num_categories
},
ctx
.
GetPlace
());
// T cumulative_probs[num_categories];
int64_t
size
=
num_categories
;
thrust
::
inclusive_scan
(
thrust
::
device
,
norm_probs_data
,
norm_probs_data
+
num_categories
,
cumulative_probs
);
dim3
block1
(
1
);
dim3
grid1
(
num_distributions
);
Cumsum
<
T
><<<
grid1
,
block1
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
norm_probs_data
,
num_distributions
,
num_categories
,
cumulative_probs
);
/*
dim3 block2(num_categories < 512 ? num_categories : 512);
dim3 grid2((num_categories-1)/block2.x+1, num_distributions);
yokiFunc<T><<<grid2, block2, 0, ctx.cuda_device_context().stream()>>>(
cumulative_probs, yokiout_data);*/
// int64_t size = num_categories;
// thrust::inclusive_scan(thrust::device, norm_probs_data,
// norm_probs_data + num_categories,
// cumulative_probs);
VLOG
(
3
)
<<
"Print cumsum "
<<
cumulative_probs
<<
"
\n
"
;
if
(
replacement
)
{
dim3
block
(
128
);
// int grid_y = 1;
dim3
grid
((
num_samples
-
1
)
/
block
.
x
+
1
);
/*
// std::vector<T> rng(num_samples);
T rng[num_samples];
std::uniform_real_distribution<T> dist(0, 1);
auto gen_ptr = framework::DefaultCPUGenerator();
auto engine = gen_ptr->GetCPUEngine();
for (int s = 0; s < num_samples; s++) {
rng[s] = dist(*engine);
}
*/
dim3
grid
((
num_samples
-
1
)
/
block
.
x
+
1
,
num_distributions
);
std
::
random_device
rd
;
auto
seed
=
rd
();
framework
::
Tensor
rng_data_t
;
auto
*
rng_data
=
rng_data_t
.
mutable_data
<
T
>
({
num_samples
},
ctx
.
GetPlace
());
auto
*
rng_data
=
rng_data_t
.
mutable_data
<
T
>
(
{
num_distributions
,
num_samples
},
ctx
.
GetPlace
());
thrust
::
counting_iterator
<
unsigned
int
>
index_sequence_begin
(
0
);
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
context
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
&
ctx
.
device_context
());
trans
(
*
context
,
index_sequence_begin
,
index_sequence_begin
+
num_samples
,
rng_data
,
RandomGeneratorCudaFunctor
<
T
>
(
seed
));
trans
(
*
context
,
index_sequence_begin
,
index_sequence_begin
+
num_distributions
*
num_samples
,
rng_data
,
RandomGeneratorCudaFunctor
<
T
>
(
seed
));
VLOG
(
3
)
<<
"Print enter
\n
"
;
// VLOG(3) << "Print size in_data " <<
...
...
@@ -267,8 +346,12 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
T
><<<
grid
,
block
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
rng_data
,
num_samples
,
out_data
,
num_distributions
,
num_categories
,
cumulative_probs
,
norm_probs_data
);
VLOG
(
3
)
<<
"Print end
\n
"
<<
out_data
;
}
VLOG
(
3
)
<<
"Print final end
\n
"
;
// MultinomialCudaFunctor<T>(out_data, in_data, num_samples, replacement,
// num_categories, num_distributions);
}
...
...
python/paddle/fluid/tests/unittests/test_multinomial_op.py
浏览文件 @
c66eec75
...
...
@@ -38,6 +38,7 @@ class TestMultinomialOp(OpTest):
# input probability is a vector, and replacement is True
self
.
input_np
=
np
.
random
.
rand
(
4
)
self
.
outputs
=
{
"Out"
:
np
.
zeros
(
100000
).
astype
(
"int64"
)}
# self.outputs = {"yokiOut": np.zeros(4).astype("int64")}
self
.
attrs
=
{
"num_samples"
:
100000
,
"replacement"
:
True
}
def
test_check_output
(
self
):
...
...
@@ -53,19 +54,21 @@ class TestMultinomialOp(OpTest):
# normalize the input to get the probability
prob
=
self
.
input_np
/
self
.
input_np
.
sum
(
axis
=-
1
,
keepdims
=
True
)
sample_prob
=
self
.
sample_output
(
np
.
array
(
outs
[
0
]))
print
(
"sample_prob: "
+
str
(
sample_prob
)
+
"
\n
prob: "
+
str
(
prob
))
# sample_prob = np.array(outs[0])
# print("input", self.input_np)
# print("sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
self
.
assertTrue
(
np
.
allclose
(
sample_prob
,
prob
,
rtol
=
0
,
atol
=
0.01
),
"sample_prob: "
+
str
(
sample_prob
)
+
"
\n
prob: "
+
str
(
prob
))
"""
class
TestMultinomialOp2
(
TestMultinomialOp
):
def
init_data
(
self
):
# input probability is a matrix
self
.
input_np
=
np
.
random
.
rand
(
3
,
4
)
self
.
outputs
=
{
"Out"
:
np
.
zeros
((
3
,
100000
)).
astype
(
"int64"
)}
# self.outputs = {"yokiOut": np.zeros((3, 4)).astype("int64")}
self
.
attrs
=
{
"num_samples"
:
100000
,
"replacement"
:
True
}
def
sample_output
(
self
,
out
):
...
...
@@ -88,11 +91,13 @@ class TestMultinomialOp3(TestMultinomialOp):
def
verify_output
(
self
,
outs
):
out
=
np
.
array
(
outs
[
0
])
# print("op3out", out)
unique_out
=
np
.
unique
(
out
)
self
.
assertEqual
(
len
(
unique_out
),
100
,
"replacement is False. categories can't be sampled repeatedly"
)
"""
"""
class TestReplacementError(unittest.TestCase):
def init_data(self):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录