Unverified commit 1d91a49d, authored by chengduo, committed by GitHub

Some trivial optimization (#13530)

* some trivial opt

* remove the fix of lod_tensor and shrink_rnn_memory_op

* refine ShrinkRNNMemoryOp

test=develop
Parent 5093afce
@@ -38,27 +38,31 @@ struct OpInfo {
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
+  std::string op_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
+                            op_type_);
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator Proto must be initialized in op info");
+                   "Operator %s Proto must be initialized in op info",
+                   op_type_);
     return *proto_;
   }
 
   const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator Creator has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(
+        creator_, "Operator %s Creator has not been registered", op_type_);
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
     PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
-                            "Operator GradOpMaker has not been registered.");
+                            "Operator %s GradOpMaker has not been registered.",
+                            op_type_);
     return grad_op_maker_;
   }

@@ -73,8 +77,9 @@ class OpInfoMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& type, const OpInfo& info) {
+  void Insert(const std::string& type, OpInfo info) {
     PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    info.op_type_ = type;
     map_.insert({type, info});
   }
......
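The op_info.h change above follows a simple pattern: the registry stamps the operator's name onto the metadata it stores, so every later PADDLE_ENFORCE message can say which operator is missing a proto, creator, or grad-op maker. Below is a minimal, self-contained sketch of that pattern; the types (Registry, Info) are simplified stand-ins for illustration, not the real paddle::framework classes.

// Minimal sketch: the registry takes the metadata by value, stamps the
// operator name onto it, and later error messages include that name.
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

struct Info {
  std::string op_type_;          // filled in by the registry on Insert()
  const void* proto_ = nullptr;  // stand-in for proto::OpProto*

  const void* Proto() const {
    if (proto_ == nullptr) {
      // The stored op_type_ makes the failure message actionable.
      throw std::runtime_error("Operator " + op_type_ +
                               " Proto has not been registered");
    }
    return proto_;
  }
};

class Registry {
 public:
  // Takes Info by value so it can be modified before storing, mirroring the
  // Insert(const std::string&, const OpInfo&) -> (..., OpInfo) change above.
  void Insert(const std::string& type, Info info) {
    info.op_type_ = type;
    map_.emplace(type, std::move(info));
  }
  const Info& Get(const std::string& type) const { return map_.at(type); }

 private:
  std::unordered_map<std::string, Info> map_;
};

int main() {
  Registry reg;
  reg.Insert("mul", Info{});
  try {
    reg.Get("mul").Proto();
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";  // "Operator mul Proto has not been registered"
  }
  return 0;
}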
@@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference {
     framework::VarDesc* reader = block->FindVarRecursive(reader_name);
     auto dtypes = reader->GetDataTypes();
     PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    auto lod_levels = reader->GetLoDLevels();
     for (size_t i = 0; i < dtypes.size(); ++i) {
       framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
       out.SetType(framework::proto::VarType::LOD_TENSOR);
       out.SetDataType(dtypes[i]);
+      out.SetLoDLevel(lod_levels[i]);
     }
   }
 };
......
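For context on the two added lines: a LoD ("level of detail") is a list of offset vectors describing nested sequences, and a variable's lod_level is that nesting depth; the hunk forwards the depth from the reader's VarDesc to each output so the sequence information is not lost during var-type inference. A rough illustration of the concept with plain STL containers rather than the real LoDTensor API:

// Rough illustration of what a "LoD level" is: a LoD is a list of offset
// vectors, and its size is the lod_level copied onto the outputs above.
#include <cstddef>
#include <iostream>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

int main() {
  // Two sequences with 3 and 2 rows: one level of sequence information.
  LoD lod = {{0, 3, 5}};
  size_t lod_level = lod.size();  // 1

  std::cout << "lod_level = " << lod_level << "\n";
  for (size_t i = 0; i + 1 < lod[0].size(); ++i) {
    std::cout << "sequence " << i << " spans rows [" << lod[0][i] << ", "
              << lod[0][i + 1] << ")\n";
  }
  return 0;
}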
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sgd_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
   }
 }
 
-template <typename T, int block_size>
+template <typename T>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                        const int64_t* rows,
                                        const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(
-        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
+                                       int64_t row_numel, int64_t limit) {
+  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
+    const T* selected_rows_ptr = selected_rows + i * row_numel;
+    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
+    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
+      // Since index in rows of SelectedRows can be duplicate, we have to use
+      // Atomic Operation to avoid concurrent write error.
+      paddle::platform::CudaAtomicAdd(
+          tensor_out_ptr + index,
+          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
+    }
   }
 }
 
 }  // namespace

@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       auto* in_data = in_value.data<T>();
       auto* out_data = param_out->data<T>();
 
-      const int block_size = 256;
-      dim3 threads(block_size, 1);
-      dim3 grid(1, in_rows.size());
-      SparseSGDFunctorKernel<
-          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+      const int kThreadsPerBlock = 256;
+      int thread_x = kThreadsPerBlock;
+      int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
+      int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+      SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
+                               ctx.cuda_device_context().stream()>>>(
           in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-          out_data, in_row_numel);
+          out_data, in_row_numel, in_rows.size());
     } else {
      PADDLE_THROW("Unsupported Variable Type of Grad");
......
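The sparse-SGD rewrite above replaces the one-block-per-row launch (grid.y equal to the number of selected rows) with a grid sized from the device's physical thread count; the kernel then strides over rows by gridDim.x and over columns by blockDim.x, using atomic adds because row ids can repeat. Below is a standalone CUDA sketch of the same scheme with raw CUDA calls and made-up demo sizes in place of the Paddle wrappers.

// Standalone sketch: fixed-size grid walks the sparse rows with a stride of
// gridDim.x; each block's threads stride over the columns; atomicAdd handles
// duplicate row ids. Demo-only names and sizes.
#include <algorithm>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void SparseSgdDemo(const float* grad_rows, const int64_t* row_ids,
                              const float* lr, float* param,
                              int64_t row_numel, int64_t num_rows) {
  for (int64_t i = blockIdx.x; i < num_rows; i += gridDim.x) {
    const float* g = grad_rows + i * row_numel;
    float* p = param + row_ids[i] * row_numel;
    for (int64_t j = threadIdx.x; j < row_numel; j += blockDim.x) {
      atomicAdd(p + j, -lr[0] * g[j]);  // row_ids may contain duplicates
    }
  }
}

int main() {
  const int64_t height = 8, row_numel = 4;
  std::vector<int64_t> rows = {1, 3, 3, 6};  // note the duplicate row id 3
  std::vector<float> grad(rows.size() * row_numel, 1.0f);
  std::vector<float> param(height * row_numel, 0.0f), lr = {0.1f};

  float *d_grad, *d_param, *d_lr; int64_t* d_rows;
  cudaMalloc(&d_grad, grad.size() * sizeof(float));
  cudaMalloc(&d_param, param.size() * sizeof(float));
  cudaMalloc(&d_lr, sizeof(float));
  cudaMalloc(&d_rows, rows.size() * sizeof(int64_t));
  cudaMemcpy(d_grad, grad.data(), grad.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_param, param.data(), param.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_lr, lr.data(), sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_rows, rows.data(), rows.size() * sizeof(int64_t), cudaMemcpyHostToDevice);

  // Same block-count recipe as the launch above: cap the grid by the device's
  // physical thread capacity rather than by the number of rows.
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  int max_threads = prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;
  int blocks = std::max(max_threads / 256, 1);
  SparseSgdDemo<<<blocks, 256>>>(d_grad, d_rows, d_lr, d_param, row_numel,
                                 static_cast<int64_t>(rows.size()));
  cudaDeviceSynchronize();

  cudaMemcpy(param.data(), d_param, param.size() * sizeof(float), cudaMemcpyDeviceToHost);
  printf("row 3, col 0 after update: %f (two gradient rows hit row 3)\n",
         param[3 * row_numel]);
  cudaFree(d_grad); cudaFree(d_param); cudaFree(d_lr); cudaFree(d_rows);
  return 0;
}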
@@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp {
     size_t height = dst_num_rows;
 
     // do shrink for the top level LoD
     if (x_tensor.lod().size() > 0 &&
         x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
-      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
-                                                              dst_num_rows, 0);
-      height = lod_offset.second.second;
-      auto out_lod = out_tensor.mutable_lod();
-      framework::AppendLoD(out_lod, lod_offset.first);
+      if (x_tensor.lod().size() > 1) {  // MultiLevel LoD
+        auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x_tensor.lod(), 0, dst_num_rows, 0);
+        height = lod_offset.second.second;
+        auto out_lod = out_tensor.mutable_lod();
+        framework::AppendLoD(out_lod, lod_offset.first);
+      } else {
+        // Shrink LoD
+        auto lod_item = x_tensor.lod()[0];
+        lod_item.resize(dst_num_rows + 1);
+        out_tensor.set_lod({lod_item});
+        const auto &const_lod_item = lod_item;
+        height = const_lod_item.back();
+      }
     }
 
-    if (dst_num_rows != 0) {
+    if (height != 0) {
       out_tensor.mutable_data(place, x_tensor.type());
       auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,

@@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     } else {
       auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
       auto height = dout_tensor.dims()[0];
-      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-      framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+      if (height != 0) {
+        auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+        framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
+                              &slice);
+      }
       if (dx_tensor.dims()[0] > height) {
         auto rest_tensor = dx_tensor.Slice(
             static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
......
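The new single-level branch shrinks the LoD directly: keeping the first dst_num_rows sequences means keeping dst_num_rows + 1 offsets, and the last kept offset is the number of underlying rows (height) that still has to be copied. A small sketch of that arithmetic, with a plain vector standing in for framework::LoD:

// Sketch of the "Shrink LoD" branch: resize the level-0 offsets and read the
// last kept offset back as the new height.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Level-0 LoD of x: four sequences covering rows [0,2), [2,5), [5,9), [9,10).
  std::vector<size_t> lod0 = {0, 2, 5, 9, 10};
  size_t dst_num_rows = 2;  // keep only the first two sequences

  std::vector<size_t> shrunk(lod0);
  shrunk.resize(dst_num_rows + 1);  // {0, 2, 5}
  size_t height = shrunk.back();    // 5 underlying rows remain

  std::cout << "shrunk lod:";
  for (size_t v : shrunk) std::cout << ' ' << v;
  std::cout << "\nrows to copy: " << height << "\n";
  return 0;
}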
@@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
+  grid_max_dims_ = GpuMaxGridDim(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);

@@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
   return multi_process * max_threads_per_mp;
 }
 
+std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
+  return grid_max_dims_;
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
......
@@ -13,6 +13,7 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
+#include <tuple>
 #include <unordered_map>
 #include <vector>

@@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return the max physical thread count in the device context */
   int GetMaxPhysicalThreadCount() const;
 
+  std::tuple<int, int, int> GetMaxGridDims() const;
+
   /*! \brief  Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;

@@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cublasHandle_t cublas_handle_;
 
+  std::tuple<int, int, int> grid_max_dims_;
+
   int compute_capability;
   int multi_process;
   int max_threads_per_mp;
......
@@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
 }
 
 template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, int limit) {
+__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
   size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
   if (idx < limit) {
     func(idx);
   }
 }
 
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
+                                                   int grid_dim) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  while (idx < limit) {
+    func(idx);
+    idx += grid_dim;
+  }
+}
+
 template <>
 struct ForRange<CUDADeviceContext> {
   ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
+      : dev_ctx_(dev_ctx), limit_(limit) {}
 
   template <typename Function>
   inline void operator()(Function func) const {
     constexpr int num_threads = 1024;
     int block_size = limit_ <= num_threads ? limit_ : num_threads;
-    int grid_size = (limit_ + num_threads - 1) / num_threads;
+    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
+    int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
 
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
+    if (grid_size < max_grid_dim) {
+      int grid_size_int = static_cast<int>(grid_size);
+      if (grid_size == 1) {
+        ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+            func);
+      } else {
+        ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
+            func, limit_);
+      }
     } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
+      ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
+                                    dev_ctx_.stream()>>>(func, limit_,
+                                                         max_grid_dim);
     }
   }
 
   const CUDADeviceContext& dev_ctx_;
-  int limit_;
+  size_t limit_;
 };
 
 #endif
......
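ForRange now compares the required block count against the device's maximum grid dimension and only falls back to a looping kernel when the one-thread-per-element launch would not fit. Below is a hedged, standalone sketch of that dispatch; it uses a conventional grid-stride loop and illustrative names (Fill, kNumThreads), so it mirrors the shape of the code above rather than reproducing it exactly.

// Dispatch sketch: one thread per element when the grid fits the hardware
// limit, otherwise a capped grid whose threads loop with a grid stride.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void FillOneThreadPerElem(float* out, size_t limit) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < limit) out[i] = 1.0f;
}

__global__ void FillGridStride(float* out, size_t limit) {
  size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < limit; i += stride)
    out[i] = 1.0f;
}

void Fill(float* d_out, size_t limit, cudaStream_t stream) {
  constexpr int kNumThreads = 1024;
  int block = static_cast<int>(std::min<size_t>(limit, kNumThreads));
  size_t grid = (limit + kNumThreads - 1) / kNumThreads;

  int max_grid_x = 0;  // hardware limit on gridDim.x for device 0
  cudaDeviceGetAttribute(&max_grid_x, cudaDevAttrMaxGridDimX, 0);

  if (grid <= static_cast<size_t>(max_grid_x)) {
    FillOneThreadPerElem<<<static_cast<int>(grid), block, 0, stream>>>(d_out, limit);
  } else {
    FillGridStride<<<max_grid_x, block, 0, stream>>>(d_out, limit);
  }
}

int main() {
  const size_t n = 1 << 20;
  float* d;
  cudaMalloc(&d, n * sizeof(float));
  Fill(d, n, nullptr);
  cudaDeviceSynchronize();
  float first = 0.f;
  cudaMemcpy(&first, d, sizeof(float), cudaMemcpyDeviceToHost);
  printf("first element: %f\n", first);
  cudaFree(d);
  return 0;
}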
@@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
                  "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
 }
+
+std::tuple<int, int, int> GpuMaxGridDim(int id) {
+  std::tuple<int, int, int> result;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  return result;
+}
+
 }  // namespace platform
 }  // namespace paddle
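One C++ detail worth noting in GpuMaxGridDim: std::get on a tuple lvalue returns an lvalue reference, so &std::get<0>(result) is a valid int* to pass to an out-parameter API such as cudaDeviceGetAttribute. A tiny illustration, with an ordinary function standing in for the CUDA call:

// &std::get<N>(tuple) yields a pointer into the tuple's storage, so an
// out-parameter API can fill the tuple element in place.
#include <iostream>
#include <tuple>

void QueryAttr(int* out, int which) { *out = 1000 * (which + 1); }  // stand-in

int main() {
  std::tuple<int, int, int> dims;
  QueryAttr(&std::get<0>(dims), 0);
  QueryAttr(&std::get<1>(dims), 1);
  QueryAttr(&std::get<2>(dims), 2);
  std::cout << std::get<0>(dims) << ", " << std::get<1>(dims) << ", "
            << std::get<2>(dims) << "\n";  // 1000, 2000, 3000
  return 0;
}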
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
 #include <string>
+#include <tuple>
 
 namespace paddle {
 namespace platform {

@@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
+std::tuple<int, int, int> GpuMaxGridDim(int id);
+
 }  // namespace platform
 }  // namespace paddle
......
@@ -311,6 +311,7 @@ def _copy_reader_var_(block, var):
     new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
+    new_var.desc.set_lod_levels(var.desc.lod_levels())
     new_var.persistable = True
     return new_var

@@ -632,6 +633,7 @@ def py_reader(capacity,
         })
 
     startup_var.desc.set_dtypes(dtypes)
+    startup_var.desc.set_lod_levels(lod_levels)
     startup_var.persistable = True
 
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
......