diff --git a/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc
index c9d3770c6e34102559c8dd40d4e049ac21c16bc1..c94c0a050892de2e84183d982cd6311ce37ea6a2 100644
--- a/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/cpu/cpu_kernel.cc
@@ -66,5 +66,15 @@ size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
   }
   return result;
 }
+
+void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
+  size_t accumulation = 1;
+  element_num->emplace_back(1);
+  for (size_t i = shape.size() - 1; i > 0; --i) {
+    accumulation *= shape[i];
+    element_num->emplace_back(accumulation);
+  }
+  std::reverse(element_num->begin(), element_num->end());
+}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/cpu/cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/cpu_kernel.h
index 2538459336e64665422b21185f9f4b650e98a0f6..a391f7586357a35447808c814828651422afe4a4 100644
--- a/mindspore/ccsrc/kernel/cpu/cpu_kernel.h
+++ b/mindspore/ccsrc/kernel/cpu/cpu_kernel.h
@@ -78,6 +78,7 @@ class CPUKernelUtils {
   static void ExpandDimsTo4(std::vector<size_t> *shape);
   static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
   static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
+  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
 };
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc
index b1565425e01a26ba7b450d181457951378671e87..d6896222b145ffdd6411c0c3726b669daa4ab704 100644
--- a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.cc
@@ -70,6 +70,8 @@ void SliceCPUKernel::InitKernel(const CNodePtr &kernel_node) {
       end_.insert(end_.begin(), 1);
     }
   }
+  CPUKernelUtils::GetElementNumEveryDim(input_shape_, &input_element_num_);
+  CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
 }
 
 bool SliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
@@ -78,12 +80,40 @@ bool SliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
   auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
 
-  for (int i = begin_[0]; i < end_[0]; i += strides_[0]) {
-    for (int j = begin_[1]; j < end_[1]; j += strides_[1]) {
-      for (int k = begin_[2]; k < end_[2]; k += strides_[2]) {
+  bool can_copy_memory[3] = {CanCopyMemoryOnAxis(0), CanCopyMemoryOnAxis(1), CanCopyMemoryOnAxis(2)};
+  size_t in_start_offset[3] = {begin_[0] * input_element_num_[0], begin_[1] * input_element_num_[1],
+                               begin_[2] * input_element_num_[2]};
+  size_t in_step_size[3] = {strides_[0] * input_element_num_[0], strides_[1] * input_element_num_[1],
+                            strides_[2] * input_element_num_[2]};
+
+  auto in_n_offset = in_start_offset[0];
+  auto out_n_offset = 0;
+  for (int i = begin_[0]; i < end_[0];
+       i += strides_[0], in_n_offset += in_step_size[0], out_n_offset += output_element_num_[0]) {
+    if (can_copy_memory[0]) {
+      CopyDataToOutput(inputs, in_n_offset, outputs, out_n_offset, input_element_num_[0]);
+      continue;
+    }
+    auto in_c_offset = in_start_offset[1];
+    auto out_c_offset = 0;
+    for (int j = begin_[1]; j < end_[1];
+         j += strides_[1], in_c_offset += in_step_size[1], out_c_offset += output_element_num_[1]) {
+      if (can_copy_memory[1]) {
+        CopyDataToOutput(inputs, in_n_offset + in_c_offset, outputs, out_n_offset + out_c_offset,
+                         input_element_num_[1]);
+        continue;
+      }
+      auto in_h_offset = in_start_offset[2];
+      auto out_h_offset = 0;
+      for (int k = begin_[2]; k < end_[2];
+           k += strides_[2], in_h_offset += in_step_size[2], out_h_offset += output_element_num_[2]) {
+        if (can_copy_memory[2]) {
+          CopyDataToOutput(inputs, in_n_offset + in_c_offset + in_h_offset, outputs,
+                           out_n_offset + out_c_offset + out_h_offset, input_element_num_[2]);
+          continue;
+        }
         for (int m = begin_[3]; m < end_[3]; m += strides_[3]) {
-          auto offset = CPUKernelUtils::CalcOffset(input_shape_, i, j, k, m);
-          *output_addr++ = input_addr[offset];
+          *output_addr++ = input_addr[in_n_offset + in_c_offset + in_h_offset + m];
         }
       }
     }
@@ -92,7 +122,38 @@ bool SliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   return true;
 }
 
-void SliceCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+bool SliceCPUKernel::CanCopyMemoryOnAxis(size_t dim) const {
+  for (size_t i = dim + 1; i < 4; ++i) {
+    if (begin_[i] != 0 || end_[i] != SizeToInt(input_shape_[i]) || strides_[i] != 1) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void SliceCPUKernel::CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t in_offset,
+                                      const std::vector<kernel::AddressPtr> &outputs, size_t out_offset,
+                                      size_t copy_num) const {
+  auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
+  auto in_buff_size = inputs[0]->size;
+  auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
+  auto out_buff_size = outputs[0]->size;
+
+  if ((in_offset + copy_num) * sizeof(float) > in_buff_size) {
+    MS_LOG(EXCEPTION) << "input memory out of bounds.";
+  }
+  if ((out_offset + copy_num) * sizeof(float) > out_buff_size) {
+    MS_LOG(EXCEPTION) << "output memory out of bounds.";
+  }
+
+  auto ret = memcpy_s(output_addr + out_offset, out_buff_size - out_offset * sizeof(float), input_addr + in_offset,
+                      copy_num * sizeof(float));
+  if (ret != EOK) {
+    MS_LOG(EXCEPTION) << "memcpy failed. ret:" << ret;
+  }
ret:" << ret; + } +} + +void SliceCPUKernel::CheckParam(const CNodePtr &kernel_node) const { size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); if (input_num != 1) { MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SliceCPUKernel needs 1 inputs."; diff --git a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h index 788c4f39ad71438c85e62d1158f0f1fbb3bb7905..c39ec12f59697799172c02280680ffb1aac96f33 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/slice_cpu_kernel.h @@ -33,12 +33,17 @@ class SliceCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void CheckParam(const CNodePtr &kernel_node); + bool CanCopyMemoryOnAxis(size_t dim) const; + void CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, size_t copy_num) const; + void CheckParam(const CNodePtr &kernel_node) const; std::vector begin_; std::vector end_; std::vector strides_; std::vector input_shape_; + std::vector input_element_num_; std::vector output_shape_; + std::vector output_element_num_; }; MS_REG_CPU_KERNEL(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), diff --git a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc index 868008b1c1f50fa9ab19027c85817487304373ee..778d214f4f1096b30775a750076f1d1e42fbcd4c 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.cc @@ -21,13 +21,14 @@ namespace mindspore { namespace kernel { void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { CheckParam(kernel_node); - output_dx_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); - input_dy_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + CPUKernelUtils::ExpandDimsTo4(&input_shape_); begin_ = AnfAlgo::GetNodeAttr>(kernel_node, BEGIN); for (size_t i = 0; i < begin_.size(); i++) { if (begin_[i] < 0) { - begin_[i] = begin_[i] + output_dx_shape_[i]; + begin_[i] = begin_[i] + output_shape_[i]; } } @@ -37,61 +38,90 @@ void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { if (strides != nullptr) { strides_ = AnfAlgo::GetNodeAttr>(kernel_node, STRIDES); end_ = AnfAlgo::GetNodeAttr>(kernel_node, END); - if (strides_.size() != end_.size() || strides_.size() != output_dx_shape_.size()) { + if (strides_.size() != end_.size() || strides_.size() != output_shape_.size()) { MS_LOG(EXCEPTION) << "stride|end|input size must be equal"; } for (size_t i = 0; i < strides_.size(); ++i) { if (strides_[i] < 0) { - strides_[i] = (strides_[i] + output_dx_shape_[i]) > 0 ? (strides_[i] + output_dx_shape_[i]) : 0; + strides_[i] = (strides_[i] + output_shape_[i]) > 0 ? (strides_[i] + output_shape_[i]) : 0; } if (end_[i] < 0) { - end_[i] = (end_[i] + output_dx_shape_[i]) > 0 ? (end_[i] + output_dx_shape_[i]) : 0; + end_[i] = (end_[i] + output_shape_[i]) > 0 ? 
       }
     }
   } else {
     auto sizes = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, SIZE);
-    if (sizes.size() != output_dx_shape_.size() || begin_.size() != output_dx_shape_.size()) {
+    if (sizes.size() != output_shape_.size() || begin_.size() != output_shape_.size()) {
       MS_LOG(EXCEPTION) << "begin|size|input size must be equal";
     }
     for (size_t i = 0; i < sizes.size(); ++i) {
       if (sizes[i] < 0) {
-        sizes[i] = (sizes[i] + output_dx_shape_[i]) > 0 ? (sizes[i] + output_dx_shape_[i]) : 0;
+        sizes[i] = (sizes[i] + output_shape_[i]) > 0 ? (sizes[i] + output_shape_[i]) : 0;
       }
       strides_.emplace_back(1);
       end_.emplace_back(begin_[i] + sizes[i]);
     }
   }
 
-  auto output_len = output_dx_shape_.size();
+  auto output_len = output_shape_.size();
   if (output_len < 4) {
     for (size_t i = 0; i < 4 - output_len; ++i) {
-      output_dx_shape_.insert(output_dx_shape_.begin(), 1);
+      output_shape_.insert(output_shape_.begin(), 1);
       begin_.insert(begin_.begin(), 0);
       strides_.insert(strides_.begin(), 1);
       end_.insert(end_.begin(), 1);
     }
   }
+  CPUKernelUtils::GetElementNumEveryDim(input_shape_, &input_element_num_);
+  CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
 }
 
 bool SliceGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                 const std::vector<kernel::AddressPtr> & /*workspace*/,
                                 const std::vector<kernel::AddressPtr> &outputs) {
-  auto input_dy_addr = reinterpret_cast<float *>(inputs[0]->addr);
-  auto output_dx_addr = reinterpret_cast<float *>(outputs[0]->addr);
+  auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
 
-  auto out_size = sizeof(float) * output_dx_shape_[0] * output_dx_shape_[1] * output_dx_shape_[2] * output_dx_shape_[3];
-  auto ret = memset_s(output_dx_addr, out_size, 0, out_size);
+  auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size);
   if (ret != EOK) {
-    MS_LOG(ERROR) << "output buff memset fail.";
+    MS_LOG(ERROR) << "output buff memset fail. ret:" << ret;
ret:" << ret; return false; } - for (int i = begin_[0]; i < end_[0]; i += strides_[0]) { - for (int j = begin_[1]; j < end_[1]; j += strides_[1]) { - for (int k = begin_[2]; k < end_[2]; k += strides_[2]) { + bool can_copy_memory[3] = {CanCopyMemoryOnAxis(0), CanCopyMemoryOnAxis(1), CanCopyMemoryOnAxis(2)}; + size_t out_start_offset[3] = {begin_[0] * output_element_num_[0], begin_[1] * output_element_num_[1], + begin_[2] * output_element_num_[2]}; + size_t out_step_size[3] = {strides_[0] * output_element_num_[0], strides_[1] * output_element_num_[1], + strides_[2] * output_element_num_[2]}; + + auto in_n_offset = 0; + auto out_n_offset = out_start_offset[0]; + for (int i = begin_[0]; i < end_[0]; + i += strides_[0], in_n_offset += input_element_num_[0], out_n_offset += out_step_size[0]) { + if (can_copy_memory[0]) { + CopyDataToOutput(inputs, in_n_offset, outputs, out_n_offset, input_element_num_[0]); + continue; + } + auto in_c_offset = 0; + auto out_c_offset = out_start_offset[1]; + for (int j = begin_[1]; j < end_[1]; + j += strides_[1], in_c_offset += input_element_num_[1], out_c_offset += out_step_size[1]) { + if (can_copy_memory[1]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset, outputs, out_n_offset + out_c_offset, + input_element_num_[1]); + continue; + } + auto in_h_offset = 0; + auto out_h_offset = out_start_offset[2]; + for (int k = begin_[2]; k < end_[2]; + k += strides_[2], in_h_offset += input_element_num_[2], out_h_offset += out_step_size[2]) { + if (can_copy_memory[2]) { + CopyDataToOutput(inputs, in_n_offset + in_c_offset + in_h_offset, outputs, + out_n_offset + out_c_offset + out_h_offset, input_element_num_[2]); + continue; + } for (int m = begin_[3]; m < end_[3]; m += strides_[3]) { - auto offset = CPUKernelUtils::CalcOffset(output_dx_shape_, i, j, k, m); - output_dx_addr[offset] = *input_dy_addr++; + output_addr[out_n_offset + out_c_offset + out_h_offset + m] = *input_addr++; } } } @@ -99,7 +129,38 @@ bool SliceGradCPUKernel::Launch(const std::vector &inputs, return true; } -void SliceGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { +bool SliceGradCPUKernel::CanCopyMemoryOnAxis(size_t dim) const { + for (size_t i = dim + 1; i < 4; ++i) { + if (begin_[i] != 0 || end_[i] != SizeToInt(output_shape_[i]) || strides_[i] != 1) { + return false; + } + } + return true; +} + +void SliceGradCPUKernel::CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, + size_t copy_num) const { + auto input_addr = reinterpret_cast(inputs[0]->addr); + auto in_buff_size = inputs[0]->size; + auto output_addr = reinterpret_cast(outputs[0]->addr); + auto out_buff_size = outputs[0]->size; + + if ((in_offset + copy_num) * sizeof(float) > in_buff_size) { + MS_LOG(EXCEPTION) << "input memory out of bounds."; + } + if ((out_offset + copy_num) * sizeof(float) > out_buff_size) { + MS_LOG(EXCEPTION) << "output memory out of bounds."; + } + + auto ret = memcpy_s(output_addr + out_offset, out_buff_size - out_offset * sizeof(float), input_addr + in_offset, + copy_num * sizeof(float)); + if (ret != EOK) { + MS_LOG(EXCEPTION) << "memcpy failed. 
ret:" << ret; + } +} + +void SliceGradCPUKernel::CheckParam(const CNodePtr &kernel_node) const { size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); if (output_num != 1) { MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SliceGradGpuKernel needs 1 output."; diff --git a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h index 847208e4bb353693f3887548c1e72f0e3ba36ad9..a256543ae06c5ee1f19f775cb7804c6abb6fa0c8 100644 --- a/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h +++ b/mindspore/ccsrc/kernel/cpu/slice_grad_cpu_kernel.h @@ -33,12 +33,17 @@ class SliceGradCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void CheckParam(const CNodePtr &kernel_node); + bool CanCopyMemoryOnAxis(size_t dim) const; + void CopyDataToOutput(const std::vector &inputs, size_t in_offset, + const std::vector &outputs, size_t out_offset, size_t copy_num) const; + void CheckParam(const CNodePtr &kernel_node) const; std::vector begin_; std::vector end_; std::vector strides_; - std::vector input_dy_shape_; - std::vector output_dx_shape_; + std::vector input_shape_; + std::vector input_element_num_; + std::vector output_shape_; + std::vector output_element_num_; }; MS_REG_CPU_KERNEL( diff --git a/tests/st/ops/cpu/test_slice_grad_op.py b/tests/st/ops/cpu/test_slice_grad_op.py index 0d76ed847e8e787373aa2294f9ff1e4035dd6b67..a3d56f0f052ff205e9b18277ffebb5d00ca30420 100644 --- a/tests/st/ops/cpu/test_slice_grad_op.py +++ b/tests/st/ops/cpu/test_slice_grad_op.py @@ -40,7 +40,7 @@ class SliceGrad(nn.Cell): @pytest.mark.level0 @pytest.mark.platform_x86_cpu @pytest.mark.env_onecard -def test_slice(): +def test_slice_grad(): x = Tensor(np.array([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]], [[5, 5, 5], [6, 6, 6]]]), mstype.float32) dy = Tensor(np.array([[[3., 1., 2.]], [[4., 1., 4.]]]), mstype.float32) slicegrad = SliceGrad() @@ -54,6 +54,27 @@ def test_slice(): print("output:\n", output) assert (output.asnumpy() == expect).all() +class SliceGrad2(nn.Cell): + def __init__(self): + super(SliceGrad2, self).__init__() + self.slicegrad = G.SliceGrad() + + def construct(self, dy, x): + return self.slicegrad(dy, x, (0, 1, 0), (2, 2, 2)) + +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard +def test_slice_grad2(): + dy = Tensor(np.array([[[2., 3.], [4., 5.]], [[8., 9.], [10., 11.]]]), mstype.float32) + x = Tensor(np.arange(2 * 3 * 2).reshape(2, 3, 2), mstype.float32) + grad = SliceGrad2() + output = grad(dy, x) + print("output:\n", output) + expect = [[[0., 0.], [2., 3.], [4., 5.]], + [[0., 0.], [8., 9.], [10., 11.]]] + assert (output.asnumpy() == expect).all() if __name__ == '__main__': - test_slice() + test_slice_grad() + test_slice_grad2() diff --git a/tests/st/ops/cpu/test_slice_op.py b/tests/st/ops/cpu/test_slice_op.py index 90c777ef502a01f7b70816e3f9d0218198cc047e..e927c31689e7bb96acd4d3bda32166094cdac385 100644 --- a/tests/st/ops/cpu/test_slice_op.py +++ b/tests/st/ops/cpu/test_slice_op.py @@ -21,6 +21,7 @@ import mindspore.nn as nn from mindspore import Tensor from mindspore.common import dtype as mstype from mindspore.ops import operations as P +from mindspore.ops.operations import _grad_ops as G context.set_context(mode=context.GRAPH_MODE, device_target='CPU') @@ -46,6 +47,27 @@ def test_slice(): print("output:\n", output) assert (output.asnumpy() == expect).all() +class Slice2(nn.Cell): + def __init__(self): + super(Slice2, self).__init__() + self.slice = P.Slice() + + def 
+    def construct(self, x):
+        return self.slice(x, (1, 0, 0), (1, 2, 3))
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_slice2():
+    x = Tensor(np.arange(3 * 2 * 3).reshape(3, 2, 3), mstype.float32)
+    expect = [[[6., 7., 8.],
+               [9., 10., 11.]]]
+
+    slice_op = Slice2()
+    output = slice_op(x)
+    print("output:\n", output)
+    assert (output.asnumpy() == expect).all()
 
 if __name__ == '__main__':
     test_slice()
+    test_slice2()
diff --git a/tests/st/ops/cpu/test_stridedslice_op.py b/tests/st/ops/cpu/test_stridedslice_op.py
index 40a597b165c59032fb597b2bfae2af7c79afef4d..82098f9cd4f0980bd016cafcd16e4aba573fb4ac 100644
--- a/tests/st/ops/cpu/test_stridedslice_op.py
+++ b/tests/st/ops/cpu/test_stridedslice_op.py
@@ -43,3 +43,6 @@ def test_slice():
     expect = [[[5., 5., 5.],
                [6., 7., 8.]]]
     assert (output.asnumpy() == expect).all()
+
+if __name__ == '__main__':
+    test_slice()