From 098eec46619a8adc8233f49752920e5b16119688 Mon Sep 17 00:00:00 2001 From: dengwentao Date: Mon, 1 Jun 2020 15:51:43 +0800 Subject: [PATCH] add cpu op embedding_look_up --- mindspore/ccsrc/kernel/CMakeLists.txt | 3 + .../embedding_look_up_comm_grad_cpu_kernel.cc | 77 ++++++ .../embedding_look_up_comm_grad_cpu_kernel.h | 46 ++++ .../cpu/embedding_look_up_cpu_kernel.cc | 228 ++++++++++++++++++ .../kernel/cpu/embedding_look_up_cpu_kernel.h | 74 ++++++ .../ccsrc/kernel/cpu/subscalar_cpu_kernel.cc | 81 +++++++ .../ccsrc/kernel/cpu/subscalar_cpu_kernel.h | 44 ++++ mindspore/ccsrc/operator/ops.cc | 3 + mindspore/ccsrc/operator/ops.h | 3 + .../pass/const_input_to_attr_registry.cc | 3 + 10 files changed, 562 insertions(+) create mode 100644 mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc create mode 100644 mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h create mode 100644 mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc create mode 100644 mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h create mode 100644 mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.cc create mode 100644 mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.h diff --git a/mindspore/ccsrc/kernel/CMakeLists.txt b/mindspore/ccsrc/kernel/CMakeLists.txt index 226692314..993768b7f 100644 --- a/mindspore/ccsrc/kernel/CMakeLists.txt +++ b/mindspore/ccsrc/kernel/CMakeLists.txt @@ -25,6 +25,9 @@ if (ENABLE_CPU) if (NOT ENABLE_MPI) list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc") list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/embedding_look_up_comm_grad_cpu_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/embedding_look_up_cpu_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/subscalar_cpu_kernel.cc") endif () endif () diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc new file mode 100644 
index 000000000..837cb647e
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.cc
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vector>
+#include "kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h"
+#include "device/cpu/cpu_device_address.h"
+#include "device/cpu/mpi/mpi_adapter.h"
+#include "ir/primitive.h"
+
+namespace mindspore {
+namespace kernel {
+// Reads the "split_num" attribute and checks that the first input dimension can be divided into that
+// many equal parts.
+void EmbeddingLookUpCommGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  CheckParam(kernel_node);
+  split_num_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "split_num");
+  MS_LOG(INFO) << "split_num: " << split_num_;
+  // split_num_ is used as a divisor both here and in Launch; zero would be a division by zero.
+  if (split_num_ <= 0) {
+    MS_LOG(EXCEPTION) << "split_num is " << split_num_ << ", but it must be a positive integer.";
+  }
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  if (input_shape.empty()) {
+    MS_LOG(EXCEPTION) << "Input must have at least one dimension.";
+  }
+  if (input_shape[0] % static_cast<size_t>(split_num_) != 0) {
+    MS_LOG(EXCEPTION) << "Input shape[0] is " << input_shape[0] << ", but it must be multiple of split_num.";
+  }
+}
+
+// Zeroes the output, then runs one AllGather per split: split i of the input is gathered into split i
+// of the output. Logs the elapsed time in microseconds and always returns true.
+bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
+                                              const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                              const std::vector<kernel::AddressPtr> &outputs) {
+#if defined(_WIN32) || defined(_WIN64)
+  auto start_time = std::chrono::steady_clock::now();
+#else
+  struct timeval start_time, end_time;
+  (void)gettimeofday(&start_time, nullptr);
+#endif
+  auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
+  size_t input_size = inputs[0]->size;
+  size_t output_size = outputs[0]->size;
+  MS_LOG(DEBUG) << "input addr: " << input_addr << "input size: " << input_size;
+  MS_LOG(DEBUG) << "output addr: " << output_addr << "output size: " << output_size;
+  memset_s(output_addr, output_size, 0, output_size);
+  // NOTE(review): the rank group is fixed to ranks 0..7 -- confirm that an 8-rank job is the only
+  // supported configuration.
+  const std::vector<int> rank_group = {0, 1, 2, 3, 4, 5, 6, 7};
+  // The buffers hold float elements, so convert byte sizes with sizeof(float); float_t is allowed to
+  // be a wider type than float.
+  size_t input_split_lens = input_size / split_num_ / sizeof(float);
+  size_t output_split_lens = output_size / split_num_ / sizeof(float);
+  for (int i = 0; i < split_num_; i++) {
+    device::cpu::MPIAdapter::Instance().AllGather(input_addr + i * input_split_lens,
+                                                  output_addr + i * output_split_lens, rank_group, input_split_lens);
+  }
+#if defined(_WIN32) || defined(_WIN64)
+  auto end_time = std::chrono::steady_clock::now();
+  std::chrono::duration<double, std::ratio<1, 1000000>> cost = end_time - start_time;
+  MS_LOG(INFO) << "EmbeddingLookUpCommGradCPUKernel, used time: " << cost.count() << " us";
+#else
+  (void)gettimeofday(&end_time, nullptr);
+  uint64_t time = 1000000 * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
+  time += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
+  MS_LOG(INFO) << "EmbeddingLookUpCommGradCPUKernel, used time: " << time << " us";
+#endif
+  return true;
+}
+
+// Rejects nodes that do not have exactly one input tensor.
+void EmbeddingLookUpCommGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 1) {
+    MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCommGradCPUKernel needs 1.";
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h
new file mode 100644
index 000000000..7222bd9be
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_comm_grad_cpu_kernel.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_
+#include <vector>
+#include <memory>
+#include "kernel/cpu/cpu_kernel.h"
+#include "kernel/cpu/cpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU kernel registered below as "EmbeddingLookupCommGrad" (float32 input, float32 output).
+class EmbeddingLookUpCommGradCPUKernel : public CPUKernel {
+ public:
+  EmbeddingLookUpCommGradCPUKernel() : split_num_(1) {}
+  ~EmbeddingLookUpCommGradCPUKernel() override = default;
+
+  // Reads the node attributes and validates the node.
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  // Runs the kernel on the given input/output buffers; workspace is unused by the implementation.
+  bool Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
+              const std::vector<kernel::AddressPtr> &outputs) override;
+
+ private:
+  void CheckParam(const CNodePtr &kernel_node);
+  int split_num_;  // value of the "split_num" node attribute; defaults to 1
+};
+
+MS_REG_CPU_KERNEL(EmbeddingLookupCommGrad,
+                  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                  EmbeddingLookUpCommGradCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc
new file mode 100644
index 000000000..d86e49fc5
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.cc
@@ -0,0 +1,228 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <exception>
+#include <string>
+#include <thread>
+#include <vector>
+#include "kernel/cpu/embedding_look_up_cpu_kernel.h"
+#include "device/cpu/cpu_device_address.h"
+#include "device/cpu/mpi/mpi_adapter.h"
+#include "ir/primitive.h"
+
+namespace mindspore {
+namespace kernel {
+// Caches the input/indices/output shapes and the "axis", "offset", "reduce_scatter_flag" and
+// "split_num" attributes. When reduce-scatter is enabled it also allocates the intermediate gather
+// buffer. Shapes are expanded to 4 dims at the end, so axis_ is stored relative to the 4-d layout.
+void EmbeddingLookUpCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  CheckParam(kernel_node);
+
+  input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  input_lens_ = 1;
+  for (auto shape : input_shape_) {
+    MS_LOG(DEBUG) << "input shape: " << shape;
+    input_lens_ = input_lens_ * shape;
+  }
+  MS_LOG(DEBUG) << "input lens: " << input_lens_;
+
+  indices_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+  indices_lens_ = 1;
+  for (auto shape : indices_shape_) {
+    MS_LOG(DEBUG) << "indice shape: " << shape;
+    indices_lens_ = indices_lens_ * shape;
+  }
+  MS_LOG(DEBUG) << "indice lens: " << indices_lens_;
+
+  output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+  for (auto shape : output_shape_) {
+    MS_LOG(DEBUG) << "output shape: " << shape;
+  }
+  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
+  MS_LOG(DEBUG) << "output type: " << output_type;
+
+  int axis = AnfAlgo::GetNodeAttr<int>(kernel_node, "axis");
+  MS_LOG(DEBUG) << "axis: " << axis;
+  const int input_rank = SizeToInt(input_shape_.size());
+  // Normalise a negative axis. The test must be on the attribute just read, not on the member axis_,
+  // which still holds its previous value at this point.
+  if (axis < 0) {
+    axis = axis + input_rank;
+  }
+  if (axis < 0 || axis >= input_rank) {
+    MS_LOG(EXCEPTION) << "Axis " << axis << " is out of range for an input of " << input_rank << " dims.";
+  }
+  // CheckParam limits the rank to 4, so this maps axis onto the shape after ExpandDimsTo4 below.
+  axis_ = 4 - input_rank + axis;
+  MS_LOG(DEBUG) << "axis_: " << axis_;
+  reduce_scatter_flag_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "reduce_scatter_flag");
+  MS_LOG(DEBUG) << "reduce_scatter_flag: " << reduce_scatter_flag_;
+  if (reduce_scatter_flag_) {
+    // Element count of the gather result: the input shape with dimension `axis` replaced by the
+    // whole indices shape.
+    size_t gatherv2_out_lens = 1;
+    for (int i = 0; i < input_rank; i++) {
+      if (i == axis) {
+        for (int j = 0; j < SizeToInt(indices_shape_.size()); j++) {
+          MS_LOG(DEBUG) << "gatherv2 out shape: " << indices_shape_[j];
+          gatherv2_out_lens = gatherv2_out_lens * indices_shape_[j];
+        }
+      } else {
+        MS_LOG(DEBUG) << "gatherv2 out shape: " << input_shape_[i];
+        gatherv2_out_lens = gatherv2_out_lens * input_shape_[i];
+      }
+    }
+    gatherv2_out_lens_ = gatherv2_out_lens * sizeof(float);
+    MS_LOG(DEBUG) << "gatherv2 out lens: " << gatherv2_out_lens_;
+    // Release a buffer left over from an earlier InitKernel call so it is not leaked.
+    if (gather_v2_out_ != nullptr) {
+      free(gather_v2_out_);
+      gather_v2_out_ = nullptr;
+    }
+    gather_v2_out_ = malloc(gatherv2_out_lens_);
+    if (gather_v2_out_ == nullptr) {
+      MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel malloc failed, malloc lens: " << gatherv2_out_lens_;
+    }
+    memset_s(gather_v2_out_, gatherv2_out_lens_, 0, gatherv2_out_lens_);
+
+    split_num_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "split_num");
+    MS_LOG(DEBUG) << "split_num: " << split_num_;
+    // split_num_ is a divisor in Launch.
+    if (split_num_ <= 0) {
+      MS_LOG(EXCEPTION) << "split_num is " << split_num_ << ", but it must be a positive integer.";
+    }
+  }
+  offset_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "offset");
+  MS_LOG(DEBUG) << "offset: " << offset_;
+  CPUKernelUtils::ExpandDimsTo4(&input_shape_);
+  CPUKernelUtils::ExpandDimsTo4(&output_shape_);
+}
+
+// Gathers rows of inputs[0] selected by inputs[1] along axis_. Without reduce-scatter the result is
+// written straight to outputs[0]; with it, the result goes to the intermediate buffer and is then
+// reduce-scattered ("sum") split by split into outputs[0]. Logs elapsed microseconds; returns true.
+bool EmbeddingLookUpCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
+                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                      const std::vector<kernel::AddressPtr> &outputs) {
+#if defined(_WIN32) || defined(_WIN64)
+  auto start_time = std::chrono::steady_clock::now();
+#else
+  struct timeval start_time, end_time;
+  (void)gettimeofday(&start_time, nullptr);
+#endif
+  auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
+  MS_LOG(DEBUG) << "output addr: " << output_addr << "output size: " << outputs[0]->size;
+  float *gather_out_addr = reduce_scatter_flag_ ? reinterpret_cast<float *>(gather_v2_out_) : output_addr;
+  MS_LOG(DEBUG) << "gatherv2 out addr: " << gather_out_addr;
+  size_t dim0 = input_shape_[0];
+  size_t dim1 = input_shape_[1];
+  size_t dim2 = input_shape_[2];
+
+  // One LookUpTable call per combination of the dimensions in front of axis_; each call advances
+  // gather_out_addr past the rows it produced.
+  if (axis_ == 3) {
+    for (size_t i = 0; i < dim0; ++i) {
+      for (size_t j = 0; j < dim1; ++j) {
+        for (size_t k = 0; k < dim2; ++k) {
+          LookUpTable(inputs, i, j, k, &gather_out_addr);
+        }
+      }
+    }
+  } else if (axis_ == 2) {
+    for (size_t i = 0; i < dim0; ++i) {
+      for (size_t j = 0; j < dim1; ++j) {
+        LookUpTable(inputs, i, j, 0, &gather_out_addr);
+      }
+    }
+  } else if (axis_ == 1) {
+    for (size_t i = 0; i < dim0; ++i) {
+      LookUpTable(inputs, i, 0, 0, &gather_out_addr);
+    }
+  } else if (axis_ == 0) {
+    LookUpTable(inputs, 0, 0, 0, &gather_out_addr);
+  }
+
+  if (reduce_scatter_flag_) {
+    // NOTE(review): the rank group is fixed to ranks 0..7 -- confirm that an 8-rank job is the only
+    // supported configuration.
+    const std::vector<int> group = {0, 1, 2, 3, 4, 5, 6, 7};
+    size_t one_split_lens = gatherv2_out_lens_ / split_num_ / sizeof(float);
+    // Each split is divided by the group size, derived from the group instead of a second literal.
+    size_t reduce_scatter_out_lens = one_split_lens / group.size();
+    for (int i = 0; i < split_num_; i++) {
+      device::cpu::MPIAdapter::Instance().ReduceScatter(reinterpret_cast<float *>(gather_v2_out_) + i * one_split_lens,
+                                                        output_addr + i * reduce_scatter_out_lens, group,
+                                                        one_split_lens, "sum");
+    }
+  }
+#if defined(_WIN32) || defined(_WIN64)
+  auto end_time = std::chrono::steady_clock::now();
+  std::chrono::duration<double, std::ratio<1, 1000000>> cost = end_time - start_time;
+  MS_LOG(INFO) << "EmbeddingLookUpCPUKernel, used time: " << cost.count() << " us";
+#else
+  (void)gettimeofday(&end_time, nullptr);
+  uint64_t time = 1000000 * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
+  time += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
+  MS_LOG(INFO) << "EmbeddingLookUpCPUKernel, used time: " << time << " us";
+#endif
+  return true;
+}
+
+// Copies `lens` bytes from mem_src_addr_list[i] to mem_dest_addr_list[i] for every i in [start, end).
+// The lists are taken by const reference so that handing them to a worker does not copy them.
+void memcpy_task(const std::vector<float *> &mem_dest_addr_list, const std::vector<float *> &mem_src_addr_list,
+                 size_t start, size_t end, size_t lens) {
+  for (size_t i = start; i < end; i++) {
+    auto ret = memcpy_s(mem_dest_addr_list[i], lens, mem_src_addr_list[i], lens);
+    if (ret != EOK) {
+      MS_LOG(EXCEPTION) << "memory copy failed.";
+    }
+  }
+  return;
+}
+
+// For one (dim0, dim1, dim2) prefix, emits one output row of `num` floats per index and advances
+// *output_addr by `num` each time. An index is valid when, after subtracting offset_, it falls inside
+// dimension axis_ of the input and the resulting row lies inside the input buffer. Valid rows are
+// copied from the input by up to 8 worker threads; every other row is zero-filled.
+void EmbeddingLookUpCPUKernel::LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1,
+                                           size_t dim2, float **output_addr) {
+  auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
+  auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr);
+  size_t num = CPUKernelUtils::GetElementNumOnAxis(input_shape_, axis_);
+  size_t lens = num * sizeof(float);
+  std::vector<float *> mem_dest_addr_list;
+  std::vector<float *> mem_src_addr_list;
+  for (size_t i = 0; i < indices_lens_; ++i) {
+    bool row_scheduled = false;
+    int indices = indices_addr[i] - offset_;
+    if (indices >= 0) {
+      size_t index = IntToSize(indices);
+      if (index < input_shape_[axis_]) {
+        size_t pos = 0;
+        if (axis_ == 3) {
+          pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, dim1, dim2, index);
+        } else if (axis_ == 2) {
+          pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, dim1, index, 0);
+        } else if (axis_ == 1) {
+          pos = CPUKernelUtils::CalcOffset(input_shape_, dim0, index, 0, 0);
+        } else if (axis_ == 0) {
+          pos = CPUKernelUtils::CalcOffset(input_shape_, index, 0, 0, 0);
+        }
+
+        if (pos + num <= input_lens_) {
+          mem_dest_addr_list.push_back(*output_addr);
+          mem_src_addr_list.push_back(input_addr + pos);
+          row_scheduled = true;
+        }
+      }
+    }
+    if (!row_scheduled) {
+      // Without this the row keeps whatever the buffer held before (for the reduce-scatter buffer,
+      // the previous Launch's values), which would then be summed across ranks or returned.
+      std::fill_n(*output_addr, num, 0.0f);
+    }
+    *output_addr += num;
+  }
+
+  const size_t thread_num = 8;
+  const size_t memcpy_lens = mem_dest_addr_list.size();
+  if (memcpy_lens == 0) {
+    return;  // nothing to copy, so do not start any thread
+  }
+  const size_t ones_copy_lens = (memcpy_lens + thread_num - 1) / thread_num;
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  // One slot per worker: an exception must not leave a thread function, so it is stored here and
+  // rethrown on the calling thread after all workers have been joined.
+  std::vector<std::exception_ptr> errors(thread_num);
+  // The ceiling division above bounds the number of iterations by thread_num; stopping at
+  // start >= memcpy_lens avoids launching workers with an empty range.
+  for (size_t start = 0; start < memcpy_lens; start += ones_copy_lens) {
+    const size_t end = std::min(start + ones_copy_lens, memcpy_lens);
+    const size_t slot = threads.size();
+    threads.emplace_back([&mem_dest_addr_list, &mem_src_addr_list, &errors, slot, start, end, lens]() {
+      try {
+        memcpy_task(mem_dest_addr_list, mem_src_addr_list, start, end, lens);
+      } catch (...) {
+        errors[slot] = std::current_exception();
+      }
+    });
+  }
+  for (auto &worker : threads) {
+    worker.join();
+  }
+  for (const auto &error : errors) {
+    if (error != nullptr) {
+      std::rethrow_exception(error);
+    }
+  }
+}
+
+// Rejects inputs with more than 4 dimensions and nodes that do not have exactly two input tensors.
+void EmbeddingLookUpCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  if (input_shape.size() > 4) {
+    MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size()
+                      << ", but EmbeddingLookUpCPUKernel only support 4d or lower.";
+  }
+
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 2) {
+    MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCPUKernel needs 2.";
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h
new file mode 100644
index 000000000..d839571ca
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/embedding_look_up_cpu_kernel.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_
+#include <vector>
+#include <memory>
+#include "kernel/cpu/cpu_kernel.h"
+#include "kernel/cpu/cpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU kernel registered below as "EmbeddingLookup" (float32 input, int32 indices, float32 output).
+class EmbeddingLookUpCPUKernel : public CPUKernel {
+ public:
+  EmbeddingLookUpCPUKernel()
+      : axis_(0),
+        offset_(0),
+        split_num_(0),
+        input_lens_(0),
+        indices_lens_(0),
+        gatherv2_out_lens_(0),
+        reduce_scatter_flag_(false),
+        gather_v2_out_(nullptr) {}
+  ~EmbeddingLookUpCPUKernel() override {
+    if (gather_v2_out_ != nullptr) {
+      free(gather_v2_out_);
+      gather_v2_out_ = nullptr;
+    }
+  }
+  // The destructor frees gather_v2_out_, so a member-wise copy would free the same buffer twice.
+  EmbeddingLookUpCPUKernel(const EmbeddingLookUpCPUKernel &) = delete;
+  EmbeddingLookUpCPUKernel &operator=(const EmbeddingLookUpCPUKernel &) = delete;
+
+  // Reads shapes and attributes from the node and prepares the kernel.
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  // Runs the kernel on the given input/output buffers; workspace is unused by the implementation.
+  bool Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
+              const std::vector<kernel::AddressPtr> &outputs) override;
+
+ private:
+  void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
+                   float **output_addr);
+  void CheckParam(const CNodePtr &kernel_node);
+  std::vector<size_t> input_shape_;
+  std::vector<size_t> indices_shape_;
+  std::vector<size_t> output_shape_;
+  int axis_;
+  int offset_;
+  int split_num_;
+  size_t input_lens_;
+  size_t indices_lens_;
+  size_t gatherv2_out_lens_;
+  bool reduce_scatter_flag_;
+
+  void *gather_v2_out_;  // owned; released with free() in the destructor
+};
+
+MS_REG_CPU_KERNEL(
+  EmbeddingLookup,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
+  EmbeddingLookUpCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.cc
new file mode 100644
index 000000000..435154561
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.cc
@@ -0,0 +1,81 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not
use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <thread>
+#include "kernel/cpu/subscalar_cpu_kernel.h"
+#include "device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+// Reads the scalar to subtract from the "input_y" node attribute.
+void SubscalarCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  offset_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_y");
+  MS_LOG(DEBUG) << "offset: " << offset_;
+}
+
+// Writes in_addr[i] - offset to out_addr[i] for the first `lens` elements.
+void sub_task(int *in_addr, int *out_addr, size_t lens, int offset) {
+  for (size_t i = 0; i < lens; i++) {
+    out_addr[i] = in_addr[i] - offset;
+  }
+}
+
+// Subtracts offset_ from every int element of inputs[0] and stores the result in outputs[0].
+// Fewer than 10000 elements are processed inline; otherwise the range is divided among 4 threads.
+// Logs the elapsed time in microseconds and always returns true.
+bool SubscalarCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
+                                const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                const std::vector<kernel::AddressPtr> &outputs) {
+#if defined(_WIN32) || defined(_WIN64)
+  auto start_time = std::chrono::steady_clock::now();
+#else
+  struct timeval start_time, end_time;
+  (void)gettimeofday(&start_time, nullptr);
+#endif
+  auto input_addr = reinterpret_cast<int *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<int *>(outputs[0]->addr);
+  auto lens = inputs[0]->size / sizeof(int);
+  if (lens < 10000) {
+    for (size_t i = 0; i < lens; i++) {
+      output_addr[i] = input_addr[i] - offset_;
+    }
+  } else {
+    constexpr size_t thread_num = 4;
+    std::thread threads[thread_num];
+    const size_t process_lens = (lens + thread_num - 1) / thread_num;
+    size_t process_offset = 0;
+    for (size_t i = 0; i < thread_num; i++) {
+      // Clamp the chunk before the worker starts: with a rounded-up chunk size the last chunk would
+      // otherwise run past the end of both buffers whenever lens is not a multiple of thread_num.
+      const size_t remaining = lens - process_offset;
+      const size_t chunk_lens = remaining < process_lens ? remaining : process_lens;
+      threads[i] =
+        std::thread(sub_task, input_addr + process_offset, output_addr + process_offset, chunk_lens, offset_);
+      process_offset += chunk_lens;
+    }
+    for (size_t i = 0; i < thread_num; i++) {
+      threads[i].join();
+    }
+  }
+#if defined(_WIN32) || defined(_WIN64)
+  auto end_time = std::chrono::steady_clock::now();
+  std::chrono::duration<double, std::ratio<1, 1000000>> cost = end_time - start_time;
+  MS_LOG(INFO) << "SubscalarCPUKernel, used time: " << cost.count() << " us";
+#else
+  (void)gettimeofday(&end_time, nullptr);
+  uint64_t time = 1000000 * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
+  time += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
+  MS_LOG(INFO) << "SubscalarCPUKernel, used time: " << time << " us";
+#endif
+  return true;
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.h b/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.h
new file mode 100644
index 000000000..bd70b075e
--- /dev/null
+++ b/mindspore/ccsrc/kernel/cpu/subscalar_cpu_kernel.h
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_KERNEL_CPU_SUBSCALAR_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_CPU_SUBSCALAR_CPU_KERNEL_H_
+#include <vector>
+#include <memory>
+#include "kernel/cpu/cpu_kernel.h"
+#include "kernel/cpu/cpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU kernel registered below as "Subscalar" (int32 input, int32 output).
+class SubscalarCPUKernel : public CPUKernel {
+ public:
+  SubscalarCPUKernel() : offset_(0) {}
+  ~SubscalarCPUKernel() override = default;
+
+  // Reads the node attributes needed by Launch.
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  // Runs the kernel on the given input/output buffers; workspace is unused by the implementation.
+  bool Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
+              const std::vector<kernel::AddressPtr> &outputs) override;
+
+ private:
+  int offset_;  // scalar subtracted from every element; defaults to 0
+};
+
+MS_REG_CPU_KERNEL(Subscalar, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+                  SubscalarCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_CPU_SUBSCALAR_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/operator/ops.cc b/mindspore/ccsrc/operator/ops.cc
index 06d84dd4e..be60c4ebb 100755
--- a/mindspore/ccsrc/operator/ops.cc
+++ b/mindspore/ccsrc/operator/ops.cc
@@ -133,6 +133,8 @@ const PrimitivePtr kPrimConcat = std::make_shared("Concat");
 const PrimitivePtr kPrimSqueeze = std::make_shared("Squeeze");
 const PrimitivePtr kPrimTranspose = std::make_shared("Transpose");
 const PrimitivePtr kPrimGatherV2 = std::make_shared("GatherV2");
+const PrimitivePtr kPrimEmbeddingLookup = std::make_shared("EmbeddingLookup");
+const PrimitivePtr kPrimEmbeddingLookupCommGrad = std::make_shared("EmbeddingLookupCommGrad");
 const PrimitivePtr kPrimSize = std::make_shared("Size");
 const PrimitivePtr kPrimArgMax = std::make_shared("Argmax");
 const PrimitivePtr kPrimPack = std::make_shared("Pack");
@@ -168,6 +170,7 @@ const PrimitivePtr kPrimLess = std::make_shared("Less");
 const PrimitivePtr kPrimLessEqual = std::make_shared("LessEqual");
 const PrimitivePtr kPrimCumSum = std::make_shared("CumSum");
 const PrimitivePtr kPrimCumProd = std::make_shared("CumProd");
+const PrimitivePtr kPrimSubscalar =
std::make_shared("Subscalar"); // NN const PrimitivePtr kPrimFlatten = std::make_shared("Flatten"); diff --git a/mindspore/ccsrc/operator/ops.h b/mindspore/ccsrc/operator/ops.h index e372a8ab1..c409989ff 100755 --- a/mindspore/ccsrc/operator/ops.h +++ b/mindspore/ccsrc/operator/ops.h @@ -140,6 +140,8 @@ extern const PrimitivePtr kPrimConcat; extern const PrimitivePtr kPrimSqueeze; extern const PrimitivePtr kPrimTranspose; extern const PrimitivePtr kPrimGatherV2; +extern const PrimitivePtr kPrimEmbeddingLookup; +extern const PrimitivePtr kPrimEmbeddingLookupCommGrad; extern const PrimitivePtr kPrimSize; extern const PrimitivePtr kPrimArgMax; extern const PrimitivePtr kPrimPack; @@ -176,6 +178,7 @@ extern const PrimitivePtr kPrimLess; extern const PrimitivePtr kPrimLessEqual; extern const PrimitivePtr kPrimCumSum; extern const PrimitivePtr kPrimCumProd; +extern const PrimitivePtr kPrimSubscalar; // NN extern const PrimitivePtr kPrimFlatten; diff --git a/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc b/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc index cc8a1341b..c7e63c9a4 100644 --- a/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc +++ b/mindspore/ccsrc/pre_activate/pass/const_input_to_attr_registry.cc @@ -36,6 +36,9 @@ ConstInputToAttrInfoRegistry::ConstInputToAttrInfoRegistry() { Register(prim::kPrimReduceSum->name(), {1}); Register(prim::kPrimReduceMean->name(), {1}); Register(prim::kPrimGatherV2->name(), {2}); + Register(prim::kPrimEmbeddingLookup->name(), {2, 3, 4, 5}); + Register(prim::kPrimEmbeddingLookupCommGrad->name(), {1}); + Register(prim::kPrimSubscalar->name(), {1}); Register(prim::kPrimTranspose->name(), {1}); Register(prim::kPrimUnsortedSegmentSum->name(), {2}); Register(prim::kPrimOneHot->name(), {1}); -- GitLab