Commit 88c4943a authored by mindspore-ci-bot, committed by Gitee

!2989 bucket reduce sparse gradient

Merge pull request !2989 from kisnwang/two-step-reduce-sparse-gradient
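
The diff below replaces the DeduplicateIndexedSlices / ReduceSparseGradient / TwoLevelReduceSparseGradient helpers with a single BucketReduceSparseGradient entry point driven by a ReduceSparseGradientParam struct. A minimal calling sketch, assembled from the kernel Launch changes below (the ReduceExample wrapper, the header path and the locally allocated buffers are illustrative assumptions; in the real kernels the buffers come from the registered workspace addresses):

// Sketch only: mirrors how the CPU kernels in this diff drive the new API.
// Assumption: the edited header declares SparseGradient, ReduceSparseGradientParam
// and BucketReduceSparseGradient in namespace mindspore::kernel.
#include <cstddef>
#include <vector>
#include "backend/kernel_compiler/common_utils.h"  // assumed header path

void ReduceExample(float *grad, int *indices, size_t indices_size,
                   size_t first_dim, size_t outer_dim) {
  using mindspore::kernel::BucketReduceSparseGradient;
  using mindspore::kernel::ReduceSparseGradientParam;
  using mindspore::kernel::SparseGradient;
  // Output and scratch buffers, sized like the workspace entries the kernels register.
  std::vector<float> out_value(indices_size * outer_dim);
  std::vector<int> out_indices(indices_size);
  std::vector<float> ws_value(indices_size * outer_dim);
  std::vector<int> ws_indices(indices_size);
  SparseGradient input_grad({grad, indices, indices_size});
  SparseGradient workspace_grad({ws_value.data(), ws_indices.data(), indices_size});
  SparseGradient output_grad({out_value.data(), out_indices.data(), indices_size});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_grad;
  param.workspace_grad_ = &workspace_grad;
  param.output_grad_ = &output_grad;
  param.max_index_ = first_dim;     // first dimension of the dense variable
  param.value_stride_ = outer_dim;  // elements per gradient row
  BucketReduceSparseGradient(param);
  // After the call, output_grad.indices_size_ holds the number of unique indices.
}
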
......@@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
float *value_;
int *indices_;
size_t indices_size_;
float *value_{nullptr};
int *indices_{nullptr};
size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
......@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list);
......@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
} // namespace kernel
} // namespace mindspore
......
......@@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
}
......@@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
......
......@@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[4]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;
......
......@@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
MultiThreadComputeParams input_params;
......
......@@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
}
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
......@@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
auto indices = reinterpret_cast<int *>(inputs[6]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;
......
......@@ -16,6 +16,7 @@
#include "backend/session/cpu_session.h"
#include <algorithm>
#include <sstream>
#include "ir/tensor.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
......@@ -148,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
}
}
namespace {
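// Build and throw the "not supported" message for a CPU operator, listing the
// input/output device types from its selected kernel build info when available.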
void KernelNotSupportException(const AnfNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
std::stringstream operator_info;
operator_info << "Operator[" << kernel_name << "] ";
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
if (kernel_info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
auto kernel_build_Info = kernel_info->select_kernel_build_info();
if (kernel_build_Info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
size_t input_num = kernel_build_Info->GetInputNum();
if (input_num > 0) {
operator_info << " input(";
for (size_t i = 0; i < input_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
if (i != input_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
size_t output_num = kernel_build_Info->GetOutputNum();
if (output_num > 0) {
operator_info << "output(";
for (size_t i = 0; i < output_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
if (i != output_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
} // namespace
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto &kernel_nodes = kernel_graph->execution_order();
......@@ -158,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
std::shared_ptr<kernel::CPUKernel> cpu_kernel =
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
if (cpu_kernel == nullptr) {
MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support.";
KernelNotSupportException(kernel_node);
}
cpu_kernel->Init(kernel_node);
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
......
......@@ -25,7 +25,7 @@ class CommonUtilTest : public UT::Common {
CommonUtilTest() = default;
};
TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest1) {
TEST_F(CommonUtilTest, BucketReduceSparseGradient1) {
// The indices is a vector and the grad is a tensor with shape (6, 2)
/* 0
* 0
......@@ -46,20 +46,39 @@ TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest1) {
for (int i = 0; i < 6 * 2; i++) {
grad.push_back(i);
}
std::vector<int> unique_indices(3);
std::vector<float> summed_grad(6);
SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 0});
ReduceSparseGradient(SparseGradient({grad.data(), indices.data(), 6}), &unique_grad, 6, 2);
std::vector<int> unique_indices(6);
std::vector<float> summed_grad(12);
std::vector<int> tmp_indices(6);
std::vector<float> tmp_grad(12);
SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 6});
SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6});
SparseGradient input_grad({grad.data(), indices.data(), 6});
ReduceSparseGradientParam param;
param.input_grad_ = &input_grad;
param.workspace_grad_ = &workspace_grad;
param.output_grad_ = &unique_grad;
param.max_index_ = 6;
param.value_stride_ = 2;
BucketReduceSparseGradient(param);
EXPECT_EQ(unique_grad.indices_size_, 3);
EXPECT_EQ(unique_indices, std::vector<int>({0, 1, 3}));
std::vector<int> expect_indices({0, 1, 3});
for (size_t i = 0; i < unique_grad.indices_size_; ++i) {
EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]);
}
/* 10 13
* 10 12
* 10 11
*/
EXPECT_EQ(summed_grad, std::vector<float>({10, 13, 10, 12, 10, 11}));
std::vector<int> expect_value({10, 13, 10, 12, 10, 11});
for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) {
EXPECT_EQ(unique_grad.value_[i], expect_value[i]);
}
}
TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest2) {
TEST_F(CommonUtilTest, BucketReduceSparseGradient2) {
// The indices is a vector and the grad is a tensor with shape (6, 2)
/* 0
* 0
......@@ -80,16 +99,36 @@ TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest2) {
for (int i = 0; i < 6 * 2; i++) {
grad.push_back(i);
}
std::vector<int> unique_indices(2);
std::vector<float> summed_grad(4);
SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 0});
ReduceSparseGradient(SparseGradient({grad.data(), indices.data(), 6}), &unique_grad, 6, 2);
std::vector<int> unique_indices(6);
std::vector<float> summed_grad(12);
std::vector<int> tmp_indices(6);
std::vector<float> tmp_grad(12);
SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 6});
SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6});
SparseGradient input_grad({grad.data(), indices.data(), 6});
ReduceSparseGradientParam param;
param.input_grad_ = &input_grad;
param.workspace_grad_ = &workspace_grad;
param.output_grad_ = &unique_grad;
param.max_index_ = 6;
param.value_stride_ = 2;
BucketReduceSparseGradient(param);
EXPECT_EQ(unique_grad.indices_size_, 2);
EXPECT_EQ(unique_indices, std::vector<int>({0, 1}));
std::vector<int> expect_indices({0, 1});
for (size_t i = 0; i < unique_grad.indices_size_; ++i) {
EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]);
}
/* 10 13
* 10 12
*/
EXPECT_EQ(summed_grad, std::vector<float>({10, 13, 10, 12}));
std::vector<int> expect_value({10, 13, 10, 12});
for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) {
EXPECT_EQ(unique_grad.value_[i], expect_value[i]);
}
}
} // namespace kernel
} // namespace mindspore
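
For reference, a minimal standalone sketch of the summation semantics these tests assert: gradient rows that share an index are summed into a single output row. Returning the unique indices in ascending order and treating max_index as an exclusive bound on valid indices are assumptions of this sketch; it illustrates the expected results only and is not the MindSpore implementation.

// Reference reduction used for illustration: sums duplicate rows per unique index.
#include <cstddef>
#include <map>
#include <vector>

struct Reduced {
  std::vector<int> indices;
  std::vector<float> value;  // indices.size() * value_stride elements
};

Reduced ReferenceReduce(const std::vector<int> &indices, const std::vector<float> &grad,
                        size_t value_stride, size_t max_index) {
  std::map<int, std::vector<float>> buckets;  // index -> summed gradient row
  for (size_t i = 0; i < indices.size(); ++i) {
    int idx = indices[i];
    if (idx < 0 || static_cast<size_t>(idx) >= max_index) {
      continue;  // assumption: out-of-range indices are dropped
    }
    auto &row = buckets[idx];
    row.resize(value_stride, 0.0f);
    for (size_t j = 0; j < value_stride; ++j) {
      row[j] += grad[i * value_stride + j];
    }
  }
  Reduced out;
  for (const auto &kv : buckets) {  // std::map iterates in ascending index order
    out.indices.push_back(kv.first);
    out.value.insert(out.value.end(), kv.second.begin(), kv.second.end());
  }
  return out;
}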