Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
272f3e6d
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
272f3e6d
编写于
11月 06, 2017
作者:
T
typhoonzero
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine get cuda context
上级
ff4c20e0
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
23 addition
and
48 deletion
+23
-48
paddle/framework/operator.h
paddle/framework/operator.h
+3
-4
paddle/operators/accuracy_op.cu
paddle/operators/accuracy_op.cu
+2
-5
paddle/operators/conv2d_transpose_cudnn_op.cu
paddle/operators/conv2d_transpose_cudnn_op.cu
+0
-1
paddle/operators/conv_cudnn_op.cu
paddle/operators/conv_cudnn_op.cu
+0
-1
paddle/operators/conv_shift_op.cu
paddle/operators/conv_shift_op.cu
+2
-6
paddle/operators/cross_entropy_op.cu
paddle/operators/cross_entropy_op.cu
+5
-10
paddle/operators/lookup_table_op.cu
paddle/operators/lookup_table_op.cu
+8
-12
paddle/operators/multiplex_op.cu
paddle/operators/multiplex_op.cu
+2
-6
paddle/operators/nccl_op.cu
paddle/operators/nccl_op.cu
+1
-3
未找到文件。
paddle/framework/operator.h
浏览文件 @
272f3e6d
...
...
@@ -298,11 +298,10 @@ class ExecutionContext {
}
#ifdef PADDLE_WITH_CUDA
const
platform
::
CUDADeviceContext
&
cuda_device_context
()
const
{
const
inline
platform
::
CUDADeviceContext
&
cuda_device_context
()
const
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
device_context_
.
GetPlace
()));
auto
cuda_ctx
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
*>
(
&
device_context_
);
return
*
cuda_ctx
;
return
*
reinterpret_cast
<
const
platform
::
CUDADeviceContext
*>
(
&
device_context_
);
}
#endif
...
...
paddle/operators/accuracy_op.cu
浏览文件 @
272f3e6d
...
...
@@ -72,11 +72,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
}
AccuracyCudaKernel
<
PADDLE_CUDA_NUM_THREADS
><<<
1
,
PADDLE_CUDA_NUM_THREADS
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
()
>>>
(
num_samples
,
infer_width
,
indices_data
,
label_data
,
accuracy_data
);
1
,
PADDLE_CUDA_NUM_THREADS
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
num_samples
,
infer_width
,
indices_data
,
label_data
,
accuracy_data
);
}
};
...
...
paddle/operators/conv2d_transpose_cudnn_op.cu
浏览文件 @
272f3e6d
...
...
@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using
ScopedFilterDescriptor
=
platform
::
ScopedFilterDescriptor
;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
CUDADeviceContext
=
platform
::
CUDADeviceContext
;
static
constexpr
size_t
kConvCudnnWorkspaceLimitBytes
=
1024
*
1024
*
1024
;
...
...
paddle/operators/conv_cudnn_op.cu
浏览文件 @
272f3e6d
...
...
@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using
ScopedFilterDescriptor
=
platform
::
ScopedFilterDescriptor
;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
CUDADeviceContext
=
platform
::
CUDADeviceContext
;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
1024
*
1024
*
1024
;
...
...
paddle/operators/conv_shift_op.cu
浏览文件 @
272f3e6d
...
...
@@ -130,9 +130,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
dim3
grid_dim
(
num_x_blocks
,
batch_size
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
.
device_context
())
.
stream
();
auto
stream
=
context
.
cuda_device_context
().
stream
();
conv_shift_forward
<
T
><<<
grid_dim
,
x_per_block
,
mem_per_block
,
stream
>>>
(
x_data
,
y_data
,
out_data
,
x_width
,
y_width
,
y_half_width
,
batch_size
);
...
...
@@ -159,9 +157,7 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
int
y_width
=
Y
->
dims
()[
1
];
int
y_half_width
=
(
y_width
-
1
)
/
2
;
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
.
device_context
())
.
stream
();
auto
stream
=
context
.
cuda_device_context
().
stream
();
const
int
x_per_block
=
256
;
int
num_x_blocks
=
div_up
(
x_width
,
x_per_block
);
...
...
paddle/operators/cross_entropy_op.cu
浏览文件 @
272f3e6d
...
...
@@ -82,24 +82,19 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
int
block
=
512
;
int
grid
=
(
batch_size
*
class_num
+
block
-
1
)
/
block
;
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
if
(
ctx
.
Attr
<
bool
>
(
"soft_label"
))
{
auto
*
label_data
=
label
->
data
<
T
>
();
SoftCrossEntropyGradientKernel
<
T
><<<
grid
,
block
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
()
>>>
(
dx_data
,
dy_data
,
x_data
,
label_data
,
batch_size
,
class_num
);
SoftCrossEntropyGradientKernel
<
T
><<<
grid
,
block
,
0
,
stream
>>>
(
dx_data
,
dy_data
,
x_data
,
label_data
,
batch_size
,
class_num
);
}
else
{
math
::
SetConstant
<
platform
::
GPUPlace
,
T
>
functor
;
functor
(
ctx
.
device_context
(),
dx
,
0
);
auto
*
label_data
=
label
->
data
<
int64_t
>
();
grid
=
(
batch_size
+
block
-
1
)
/
block
;
CrossEntropyGradientKernel
<
T
><<<
grid
,
block
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
()
>>>
(
dx_data
,
dy_data
,
x_data
,
label_data
,
batch_size
,
class_num
);
CrossEntropyGradientKernel
<
T
><<<
grid
,
block
,
0
,
stream
>>>
(
dx_data
,
dy_data
,
x_data
,
label_data
,
batch_size
,
class_num
);
}
}
};
...
...
paddle/operators/lookup_table_op.cu
浏览文件 @
272f3e6d
...
...
@@ -74,10 +74,9 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
dim3
threads
(
128
,
8
);
dim3
grids
(
8
,
1
);
LookupTable
<
T
,
128
,
8
,
8
><<<
grids
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
.
device_context
())
.
stream
()
>>>
(
output
,
table
,
ids
,
N
,
K
,
D
);
LookupTable
<
T
,
128
,
8
,
8
><<<
grids
,
threads
,
0
,
context
.
device_context
().
stream
()
>>>
(
output
,
table
,
ids
,
N
,
K
,
D
);
}
};
...
...
@@ -95,9 +94,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto
*
ids_data
=
ids
->
data
<
int64_t
>
();
auto
ids_dim
=
ids
->
dims
();
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
.
device_context
())
.
stream
();
auto
stream
=
context
.
cuda_device_context
().
stream
();
// copy GPU memory to CPU pinned memory
framework
::
Vector
<
int64_t
>
new_rows
;
new_rows
.
resize
(
ids_dim
[
0
]);
...
...
@@ -136,11 +133,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
dim3
threads
(
128
,
8
);
dim3
grids
(
8
,
1
);
LookupTableGrad
<
T
,
128
,
8
,
8
><<<
grids
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
.
device_context
())
.
stream
()
>>>
(
d_table
,
d_output
,
ids
,
N
,
K
,
D
);
LookupTableGrad
<
T
,
128
,
8
,
8
><<<
grids
,
threads
,
0
,
context
.
device_context
().
stream
()
>>>
(
d_table
,
d_output
,
ids
,
N
,
K
,
D
);
}
}
};
...
...
paddle/operators/multiplex_op.cu
浏览文件 @
272f3e6d
...
...
@@ -35,9 +35,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
Tensor
index_t_cpu
;
index_t_cpu
.
CopyFrom
(
*
ids
,
platform
::
CPUPlace
(),
ctx
.
device_context
());
auto
*
index
=
index_t_cpu
.
data
<
int32_t
>
();
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
();
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
Place
place
=
boost
::
get
<
Place
>
(
ctx
.
GetPlace
());
for
(
auto
i
=
0
;
i
<
rows
;
i
++
)
{
int32_t
k
=
index
[
i
];
...
...
@@ -73,9 +71,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
index_t_cpu
.
CopyFrom
(
*
ids
,
platform
::
CPUPlace
(),
ctx
.
device_context
());
auto
*
index
=
index_t_cpu
.
data
<
int32_t
>
();
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
();
auto
stream
=
ctx
.
device_context
().
stream
();
Place
place
=
boost
::
get
<
Place
>
(
ctx
.
GetPlace
());
for
(
auto
i
=
0
;
i
<
rows
;
i
++
)
{
size_t
k
=
static_cast
<
size_t
>
(
index
[
i
]);
...
...
paddle/operators/nccl_op.cu
浏览文件 @
272f3e6d
...
...
@@ -64,9 +64,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
();
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
// device id
int
gpu_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录