未验证 提交 11fb8a1c 编写于 作者: G GaoWei8 提交者: GitHub

Refine cudnn softmax (#25757)

* refine cudnn softmax
上级 885c61f0
...@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
// Returns the product of the extents of every dimension that follows `axis`,
// i.e. dims[axis + 1] * ... * dims[dims.size() - 1].
// Yields 1 when no dimension follows `axis` (the loop body never runs).
// The product is accumulated in a plain `int`.
static inline int SizeOutAxis(const int axis, DDim dims) {
  int inner_size = 1;
  int idx = axis + 1;
  while (idx < dims.size()) {
    inner_size *= dims[idx];
    ++idx;
  }
  return inner_size;
}
template <typename T> template <typename T>
class SoftmaxCUDNNKernel : public framework::OpKernel<T> { class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = context.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out"); auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
// allocate memory on device. auto* out_data = out->data<T>();
Out->mutable_data<T>(context.GetPlace());
auto dims = x->dims();
auto dims = X->dims(); const int rank = dims.size();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
framework::LoDTensor flattened_x; const int dim = dims[axis];
framework::LoDTensor flattened_out; const int N = SizeToAxis(axis, dims);
flattened_x.ShareDataWith(*X).Resize(flattened_dims); const int D = SizeOutAxis(axis, dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
ScopedTensorDescriptor desc;
math::SoftmaxCUDNNFunctor<T>()( std::vector<int> tensor_dims = {N, dim, D, 1};
context.template device_context<platform::CUDADeviceContext>(), DataLayout layout = DataLayout::kNCHW;
&flattened_x, &flattened_out); cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
platform::CudnnDataType<T>::kZero(), desc_, out_data));
} }
}; };
template <typename T> template <typename T>
class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> { class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* Out = context.Input<Tensor>("Out"); auto* out = ctx.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
// allocate memory on device. auto* dx_data = dx->data<T>();
dX->mutable_data<T>(context.GetPlace());
auto dims = out->dims();
auto dims = Out->dims(); const int rank = dims.size();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
framework::LoDTensor flattened_out; const int dim = dims[axis];
framework::LoDTensor flattened_d_out; const int N = SizeToAxis(axis, dims);
framework::LoDTensor flattened_d_x; const int D = SizeOutAxis(axis, dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); ScopedTensorDescriptor desc;
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); std::vector<int> tensor_dims = {N, dim, D, 1};
DataLayout layout = DataLayout::kNCHW;
math::SoftmaxGradCUDNNFunctor<T>()( cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_out, &flattened_d_out, &flattened_d_x); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_, dx_data));
} }
}; };
......
...@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
"Attr(axis) value should be in range [-R, R-1], " "Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(X).")); "R is the rank of Input(X)."));
auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
if (axis != rank_x - 1 && axis != -1) {
PADDLE_ENFORCE_EQ(use_cudnn, false,
platform::errors::InvalidArgument(
"CUDNN kernel only support axis as -1."));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
......
...@@ -153,16 +153,103 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): ...@@ -153,16 +153,103 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
return [2, 3, 4, 5] return [2, 3, 4, 5]
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5] and axis 0.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 4-D shape used by this case."""
        return [2, 3, 4, 5]

    def get_axis(self):
        """Return the axis used by this case (the first dimension)."""
        return 0
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5] and axis 1.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 4-D shape used by this case."""
        return [2, 3, 4, 5]

    def get_axis(self):
        """Return the axis used by this case (the second dimension)."""
        return 1
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA") "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp): class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
def get_x_shape(self): def get_x_shape(self):
return [2, 3, 4, 5] return [2, 3, 4, 5]
def get_axis(self):
return 2
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxCUDNNOp6(TestSoftmaxCUDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self): def get_axis(self):
return 3 return 3
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp7(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting the 5-D shape [2, 3, 4, 5, 6].

    Skipped when core.is_compiled_with_cuda() is false. get_axis() is not
    overridden here, so whatever the base class provides is used.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp8(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5, 6] and axis 0.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]

    def get_axis(self):
        """Return the axis used by this case (the first dimension)."""
        return 0
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp9(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5, 6] and axis 1.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]

    def get_axis(self):
        """Return the axis used by this case (the second dimension)."""
        return 1
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp10(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5, 6] and axis 2.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]

    def get_axis(self):
        """Return the axis used by this case (the third dimension)."""
        return 2
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp11(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5, 6] and axis 3.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]

    def get_axis(self):
        """Return the axis used by this case (the fourth dimension)."""
        return 3
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp12(TestSoftmaxCUDNNOp):
    """TestSoftmaxCUDNNOp variant reporting shape [2, 3, 4, 5, 6] and axis 4.

    Skipped when core.is_compiled_with_cuda() is false. The two hooks below
    are presumably read by the base test case -- verify against
    TestSoftmaxCUDNNOp.
    """

    def get_x_shape(self):
        """Return the 5-D shape used by this case."""
        return [2, 3, 4, 5, 6]

    def get_axis(self):
        """Return the axis used by this case (the last of five dimensions)."""
        return 4
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA") "core is not compiled with CUDA")
class TestSoftmaxFP16Op(TestSoftmaxOp): class TestSoftmaxFP16Op(TestSoftmaxOp):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册