diff --git a/dnn/include/megdnn/oprs/general.h b/dnn/include/megdnn/oprs/general.h index 3fddc64d8d6fbe66f247fe73f6b24d60a123208b..b327f3fcc714afed59071f1f2eecd39f02beacff 100644 --- a/dnn/include/megdnn/oprs/general.h +++ b/dnn/include/megdnn/oprs/general.h @@ -500,46 +500,19 @@ public: /* * \param[in] srcs: TensorND on cpu. srcs[i] corresponding to the * address of i-th Tensor. - * \param[in] table: with size `2 * srcs.nr_total_elems()`. - * table[addr] corresponding to outer_idx, - * table[addr+srcs.nr_total_elems()] corresponding to - * inner_idx of dsts. + * \param[in] offsets: with size `2 * srcs.shape[0]`. + * offsets[i * 2] and offsets[i * 2 + 1] means + * the begin and the end of offset in * \param[out] dst: output TensorND, live on cpu or gpu */ - virtual void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + virtual void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in offsets, _megdnn_tensor_out dst, _megdnn_workspace workspace) = 0; virtual size_t get_workspace_in_bytes(const TensorShapeArray& srcs, - const TensorShape& table, + const TensorShape& offsets, const TensorShape& dst) = 0; }; -/** - * \brief ParamPackSplit, used for network forwarding. - * Split a single large param into several small tensors, use copy stategy - * either. - */ -class ParamPackSplit: public ParamPackConcatSplitBase { - DEF_OPR_IMPL(ParamPackSplit, ParamPackConcatSplitBase, 2, 1); - -public: - /* - * \param[in] src: input TensorND, live on cpu or gpu - * \param[in] table: with size `2 * srcs.nr_total_elems()`. - * table[addr] corresponding to outer_idx, - * table[addr+srcs.nr_total_elems()] corresponding to - * inner_idx of dsts. - * \param[out] dsts: TensorND on cpu. dsts[i] corresponding to the address - * of i-th Tensor - */ - virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, _megdnn_workspace workspace) = 0; - - virtual size_t get_workspace_in_bytes(const TensorShape& src, - const TensorShape& table, - const TensorShapeArray& dsts) = 0; -}; - /** * \brief base class for Tile and Repeat */ diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h index 2e6ec73fbb2216a7def52a806f94603641c2a5cd..c8d654b73f31eb99107ec07432c53183a01032f4 100644 --- a/dnn/src/common/handle_impl.h +++ b/dnn/src/common/handle_impl.h @@ -167,7 +167,6 @@ private: cb(Resize) \ cb(ResizeBackward) \ cb(ParamPackConcat) \ - cb(ParamPackSplit) \ cb(MaxTensorDiff) \ cb(MaskConvForward) \ cb(MaskPropagate) \ diff --git a/dnn/src/common/param_pack.cpp b/dnn/src/common/param_pack.cpp index 4eb9de4d1f1d21eb86389e6a5833e24307feba15..2f40d701f2603f8d089561353303edf2e360510c 100644 --- a/dnn/src/common/param_pack.cpp +++ b/dnn/src/common/param_pack.cpp @@ -48,9 +48,9 @@ std::vector ParamPackConcatSplitBase::gen_offsets( size_t offset = 0; for (size_t i = 0; i < shapes.size(); i++) { offset = get_aligned(offset); - offsets[i * 2] = offset; + offsets[i << 1] = offset; offset += shapes[i].total_nr_elems(); - offsets[i * 2 + 1] = offset; + offsets[(i << 1) + 1] = offset; } return offsets; } diff --git a/dnn/src/cuda/param_pack/opr_impl.cpp b/dnn/src/cuda/param_pack/opr_impl.cpp index fb521eaec8721ab840cf7ad77e872666448733ed..ad2d11961dd8eb8b2f1917fa5c0a8bc9ad108731 100644 --- a/dnn/src/cuda/param_pack/opr_impl.cpp +++ b/dnn/src/cuda/param_pack/opr_impl.cpp @@ -60,56 +60,5 @@ void ParamPackConcatImpl::exec(_megdnn_tensor_in srcs, #undef cb } -size_t ParamPackSplitImpl::get_workspace_in_bytes( - const TensorShape&, const TensorShape&, const TensorShapeArray& dsts) { - return sizeof(size_t) * dsts.size(); -} - -template -void ParamPackSplitImpl::exec_internal(_megdnn_tensor_in src, - _megdnn_tensor_in table, - _megdnn_tensor_out dsts, - _megdnn_workspace workspace) { - // inner and outer table must be int32 - megdnn_assert(table.layout.dtype == dtype::Int32()); - // dsts is src pointer, ndim must be 1 - megdnn_assert(dsts.layout.ndim == 1); - - auto out_size = dsts.layout.shape[0], - inp_size = src.layout.total_nr_elems(); - - auto stream = cuda_stream(this->handle()); - - auto total_workspace_size = sizeof(T*) * out_size; - auto dsts_cpu = static_cast(dsts.raw_ptr); - megdnn_assert_internal(dsts_cpu); - auto dsts_gpu = reinterpret_cast(workspace.raw_ptr); - - auto table_outer_gpu = table.ptr(); - auto table_inner_gpu = table_outer_gpu + inp_size; - - cuda_check(cudaMemcpyAsync(dsts_gpu, dsts_cpu, total_workspace_size, - cudaMemcpyHostToDevice, stream)); - - // param_pack_split_proxy() - param_pack::split_proxy(src.ptr(), dsts_gpu, inp_size, - table_outer_gpu, table_inner_gpu, stream); -} - -void ParamPackSplitImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, - _megdnn_workspace workspace) { - check_exec(src.layout, table.layout, dsts.layout); -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - exec_internal(src, table, dsts, workspace); \ - return; \ - } - MEGDNN_FOREACH_COMPUTING_DTYPE(cb) - megdnn_throw("bad type"); -#undef cb -} - } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/param_pack/opr_impl.h b/dnn/src/cuda/param_pack/opr_impl.h index ab46434edb6b03ea6e23bdd840e7d3fc6cd51741..38e2ec9fefd93a4563e0547cef28dbf6c3f7a6c8 100644 --- a/dnn/src/cuda/param_pack/opr_impl.h +++ b/dnn/src/cuda/param_pack/opr_impl.h @@ -31,21 +31,5 @@ private: _megdnn_tensor_out dst, _megdnn_workspace workspace); }; -class ParamPackSplitImpl final : public ParamPackSplit { -public: - using ParamPackSplit::ParamPackSplit; - void exec(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, _megdnn_workspace workspace) override; - - size_t get_workspace_in_bytes(const TensorShape& src, - const TensorShape& table, - const TensorShapeArray& dsts) override; - -private: - template - void exec_internal(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, _megdnn_workspace workspace); -}; - } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/param_pack/param_pack.cu b/dnn/src/cuda/param_pack/param_pack.cu index 1939d00226314881db8ea292cf68846da80792c6..58d4112a5eb86ef8ed45d9ec7dee01220b9037bd 100644 --- a/dnn/src/cuda/param_pack/param_pack.cu +++ b/dnn/src/cuda/param_pack/param_pack.cu @@ -40,31 +40,6 @@ __global__ void concat_kernel(const T** srcs, T* dst, } } -template -__global__ void split_kernel(const T* src, T** dsts, - const int32_t* table_outer, - const int32_t* table_inner, - size_t total_size) { - size_t addr = threadIdx.x + blockIdx.x * blockDim.x; - if (addr < total_size) { - int32_t i = table_outer[addr]; - int32_t idx = table_inner[addr]; - if (idx != -1) { - dsts[i][idx] = src[addr]; - } - } -} - -template -void split_proxy(const T* src, T** dsts, size_t total_size, - const int32_t* table_outer, const int32_t* table_inner, - cudaStream_t stream) { - size_t NR_BLOCKS = DIVUP(total_size, NR_THREADS); - split_kernel<<>>( - src, dsts, table_outer, table_inner, total_size); - after_kernel_launch(); -} - template void concat_proxy(const T** srcs, T* dst, size_t srcs_size, size_t total_size, const int32_t* offsets, @@ -78,10 +53,7 @@ void concat_proxy(const T** srcs, T* dst, size_t srcs_size, size_t total_size, #define INST(T) \ template void concat_proxy(const T**, T*, size_t, size_t, \ const int32_t*, \ - cudaStream_t); \ - template void split_proxy(const T*, T**, size_t, \ - const int32_t*, const int32_t*, \ - cudaStream_t); + cudaStream_t); #define cb(DType) INST(typename DTypeTrait::ctype) MEGDNN_FOREACH_COMPUTING_DTYPE(cb) #undef cb diff --git a/dnn/src/cuda/param_pack/param_pack.cuh b/dnn/src/cuda/param_pack/param_pack.cuh index 53dc3e9c73d0dce90fc5feb2d1040702559a3781..aa79441c1ad15be742b1678a1713de513076bf1a 100644 --- a/dnn/src/cuda/param_pack/param_pack.cuh +++ b/dnn/src/cuda/param_pack/param_pack.cuh @@ -19,11 +19,6 @@ namespace megdnn { namespace cuda { namespace param_pack { -template -void split_proxy(const T* src, T** dsts, size_t total_size, - const int32_t* table_outer, const int32_t* table_inner, - cudaStream_t stream); - template void concat_proxy(const T** srcs, T* dst, size_t srcs_size, size_t total_size, const int32_t* offsets, cudaStream_t stream); diff --git a/dnn/src/naive/param_pack/opr_impl.cpp b/dnn/src/naive/param_pack/opr_impl.cpp index 8b15ce9b8c97e2a9981cf575e6411d7f8aba01bd..c457c99ee5b462b7f9cfd1afe3d8fa5e40648b4c 100644 --- a/dnn/src/naive/param_pack/opr_impl.cpp +++ b/dnn/src/naive/param_pack/opr_impl.cpp @@ -16,43 +16,6 @@ using namespace megdnn; using namespace naive; -template -void ParamPackSplitImpl::exec_internal(_megdnn_tensor_in src, int32_t* table, - _megdnn_tensor_out dsts, - _megdnn_workspace) { - auto dsts_ptr = static_cast(dsts.raw_ptr); - auto src_ptr = src.ptr(); - - auto inp_size = src.layout.total_nr_elems(); - auto table_outer = table, table_inner = table_outer + inp_size; - - for (size_t j = 0; j < inp_size; j++) { - int32_t i = table_outer[j]; - int32_t idx = table_inner[j]; - if (idx != -1) { - dsts_ptr[i][idx] = src_ptr[j]; - } - } -} - -void ParamPackSplitImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, - _megdnn_workspace workspace) { - check_exec(src.layout, table.layout, dsts.layout); - auto table_ptr = table.ptr(); - -#define cb(DType) \ - if (src.layout.dtype == DType()) { \ - using ctype = typename DTypeTrait::ctype; \ - MEGDNN_DISPATCH_CPU_KERN_OPR( \ - exec_internal(src, table_ptr, dsts, workspace)); \ - return; \ - } - MEGDNN_FOREACH_COMPUTING_DTYPE(cb) - megdnn_throw("bad type"); -#undef cb -} - template void ParamPackConcatImpl::exec_internal(_megdnn_tensor_in srcs, int32_t* offsets, diff --git a/dnn/src/naive/param_pack/opr_impl.h b/dnn/src/naive/param_pack/opr_impl.h index bf8df0b0d79e5f623c4ebb437e20fb64ea0379df..0d0e15450f6222b9727f0080e0a7d3dad42a16b3 100644 --- a/dnn/src/naive/param_pack/opr_impl.h +++ b/dnn/src/naive/param_pack/opr_impl.h @@ -13,27 +13,10 @@ namespace megdnn { namespace naive { -class ParamPackSplitImpl final : public ParamPackSplit { -public: - using ParamPackSplit::ParamPackSplit; - void exec(_megdnn_tensor_in src, _megdnn_tensor_in table, - _megdnn_tensor_out dsts, _megdnn_workspace workspace) override; - - size_t get_workspace_in_bytes(const TensorShape&, const TensorShape&, - const TensorShapeArray&) override { - return 0; - } - -private: - template - void exec_internal(_megdnn_tensor_in src, int32_t* table, - _megdnn_tensor_out dsts, _megdnn_workspace workspace); -}; - class ParamPackConcatImpl final : public ParamPackConcat { public: using ParamPackConcat::ParamPackConcat; - void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in table, + void exec(_megdnn_tensor_in srcs, _megdnn_tensor_in offsets, _megdnn_tensor_out dst, _megdnn_workspace workspace) override; size_t get_workspace_in_bytes(const TensorShapeArray&, const TensorShape&, @@ -43,7 +26,7 @@ public: private: template - void exec_internal(_megdnn_tensor_in srcs, int32_t* table, + void exec_internal(_megdnn_tensor_in srcs, int32_t* offsets, _megdnn_tensor_out dst, _megdnn_workspace workspace); }; diff --git a/dnn/test/cuda/param_pack.cpp b/dnn/test/cuda/param_pack.cpp index 8406e0db5496626294edeaa183b5c3982eb0ab3a..9450d4aee13a7d1961cca85e394ac9eea3b06b85 100644 --- a/dnn/test/cuda/param_pack.cpp +++ b/dnn/test/cuda/param_pack.cpp @@ -18,56 +18,38 @@ using namespace test; namespace { template -std::vector create_table(const TensorShapeArray& shapes, - size_t align) { +std::vector create_offsets(const TensorShapeArray& shapes, + size_t alignment) { size_t dtype_size = sizeof(T); - if (align < dtype_size) - align = dtype_size; + if (alignment < dtype_size) + alignment = dtype_size; + alignment /= dtype_size; - align /= dtype_size; + auto get_aligned = [alignment](size_t v) { + auto mod = v & (alignment - 1); + return v + ((alignment - mod) & (alignment - 1)); + }; - size_t offset = shapes[0].total_nr_elems(); - for (size_t i = 1; i < shapes.size(); i++) { - auto d = offset & (align - 1); - offset += (align - d) & (align - 1); - - offset += shapes[i].total_nr_elems(); - } - - std::vector table(offset * 2); - - int32_t* outer_table = table.data(); - int32_t* inner_table = outer_table + offset; - - offset = 0; + std::vector offsets(shapes.size() << 1); + size_t offset = 0; for (size_t i = 0; i < shapes.size(); i++) { - for (; (offset & (align - 1)) != 0; offset++) { - outer_table[offset] = inner_table[offset] = -1; - } - - size_t j = 0; - for (; j < shapes[i].total_nr_elems(); j++) { - outer_table[offset + j] = i; - inner_table[offset + j] = j; - } - offset += j; + offset = get_aligned(offset); + offsets[i << 1] = offset; + offset += shapes[i].total_nr_elems(); + offsets[(i << 1) + 1] = offset; } - return table; + return offsets; } template -std::vector create_pack(size_t pack_size, const std::vector& table, +std::vector create_pack(size_t pack_size, const std::vector& offsets, const std::vector>& ptr) { - assert(pack_size == table.size() / 2); - const int32_t* outer_table = table.data(); - const int32_t* inner_table = outer_table + pack_size; - std::vector data(pack_size); - for (size_t idx = 0; idx < pack_size; ++idx) { - int32_t out_idx = outer_table[idx]; - int32_t in_idx = inner_table[idx]; - if (in_idx != -1) { - data[idx] = ptr[out_idx][in_idx]; - } + assert(pack_size == offsets.back()); + std::vector data(pack_size, 0); + for (size_t i = 0; i * 2 < offsets.size(); ++i) { + size_t begin = offsets[i * 2], end = offsets[i * 2 +1]; + for (size_t j = 0;j < end - begin; j++) + data[begin + j] = ptr[i][j]; } return data; } @@ -95,65 +77,6 @@ T* create_device_data(Handle* handle, const T* data, size_t size) { return data_device; } -template -void test_param_pack_split(Handle* handle, const TensorShapeArray& shapes, - DType type) { - auto split = handle->create_operator(); - - size_t nr_params = shapes.size(); - std::vector param_ptrs; - for (size_t i = 0; i < nr_params; ++i) { - param_ptrs.push_back(create_device_data(handle, - nullptr, shapes[i].total_nr_elems())); - } - std::vector> expected_param = create_params(nr_params, - shapes); - - std::vector table = - create_table(shapes, handle->alignment_requirement()); - ASSERT_EQ(table, - ParamPackSplit::gen_offsets( - shapes, handle->alignment_requirement(), sizeof(T))); - size_t pack_size = table.size() / 2; - int32_t* table_gpu = create_device_data(handle, table.data(), - table.size()); - - std::vector pack = - create_pack(pack_size, table, expected_param); - T* pack_gpu = create_device_data(handle, pack.data(), pack.size()); - - TensorLayout src_layout({pack_size}, type); - TensorND src_tensor(pack_gpu, src_layout); - - TensorLayout table_layout({table.size()}, dtype::Int32()); - TensorND table_tensor(table_gpu, table_layout); - - test::WorkspaceWrapper workspace(handle, split->get_workspace_in_bytes( - {pack_size}, table_layout, shapes)); - TensorND dst_tensor(param_ptrs.data(), - TensorLayout({nr_params}, dtype::Int32())); - - split->exec(src_tensor, table_tensor, dst_tensor, workspace.workspace()); - - - // check - for (size_t i = 0; i < nr_params; ++i) { - T* actual_param = static_cast(malloc(shapes[i].total_nr_elems() - * sizeof(T))); - test::megdnn_memcpy_D2H(handle, actual_param, param_ptrs[i], - shapes[i].total_nr_elems() * sizeof(T)); - for (size_t idx = 0; idx < shapes[i].total_nr_elems(); ++idx) { - ASSERT_EQ(actual_param[idx], expected_param[i][idx]); - } - free(actual_param); - } - test::megdnn_free(handle, pack_gpu); - test::megdnn_free(handle, table_gpu); - for (auto ptr : param_ptrs) { - test::megdnn_free(handle, ptr); - } -} - template void test_param_pack_concat(Handle* handle, const TensorShapeArray& shapes, DType type) { @@ -167,28 +90,28 @@ void test_param_pack_concat(Handle* handle, const TensorShapeArray& shapes, param_ptrs.push_back(create_device_data(handle, params[i].data(), shapes[i].total_nr_elems())); } - std::vector table = - create_table(shapes, handle->alignment_requirement()); - size_t pack_size = table.size() / 2; - int32_t* table_gpu = create_device_data(handle, table.data(), - table.size()); + std::vector offsets = + create_offsets(shapes, handle->alignment_requirement()); + size_t pack_size = offsets.back(); + int32_t* offsets_gpu = create_device_data(handle, offsets.data(), + offsets.size()); std::vector expected_pack = - create_pack(pack_size, table, params); + create_pack(pack_size, offsets, params); T* pack_gpu = create_device_data(handle, nullptr, expected_pack.size()); TensorLayout dst_layout({pack_size}, type); TensorND dst_tensor(pack_gpu, dst_layout); - TensorLayout table_layout({table.size()}, dtype::Int32()); - TensorND table_tensor(table_gpu, table_layout); + TensorLayout offsets_layout({offsets.size()}, dtype::Int32()); + TensorND offsets_tensor(offsets_gpu, offsets_layout); test::WorkspaceWrapper workspace(handle, concat->get_workspace_in_bytes( - shapes, table_layout, {pack_size})); + shapes, offsets_layout, {pack_size})); TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); - concat->exec(src_tensor, table_tensor, dst_tensor, workspace.workspace()); + concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); // check T* actual_pack = static_cast(malloc(pack_size * sizeof(T))); @@ -199,7 +122,7 @@ void test_param_pack_concat(Handle* handle, const TensorShapeArray& shapes, } free(actual_pack); test::megdnn_free(handle, pack_gpu); - test::megdnn_free(handle, table_gpu); + test::megdnn_free(handle, offsets_gpu); for (auto ptr : param_ptrs) { test::megdnn_free(handle, ptr); } @@ -222,9 +145,6 @@ TEST_F(CUDA, PARAM_PACK) { {111, 111, 111}, {128, 128, 128}}); for (auto shapes : shapes_vec) { - test_param_pack_split(handle_cuda(), shapes, dtype::Int32()); - test_param_pack_split(handle_cuda(), shapes, dtype::Int16()); - test_param_pack_split(handle_cuda(), shapes, dtype::Float32()); test_param_pack_concat(handle_cuda(), shapes, dtype::Int32()); test_param_pack_concat(handle_cuda(), shapes, dtype::Int16()); test_param_pack_concat(handle_cuda(), shapes, dtype::Float32()); diff --git a/python_module/src/cpp/opr_defs.cpp b/python_module/src/cpp/opr_defs.cpp index 021e513ede14b03ffb5dc3050ca5efa5b5620d49..220605a9aede36226a1e9a6829665e256c8807ce 100644 --- a/python_module/src/cpp/opr_defs.cpp +++ b/python_module/src/cpp/opr_defs.cpp @@ -38,8 +38,7 @@ SymbolVar _Opr::_axis_add_remove(SymbolVar src, } SymbolVarArray _Opr::param_pack_split( - SymbolVar src, SymbolVar table, - const std::vector>& shapes, + SymbolVar src, const std::vector>& shapes, const OperatorNodeConfig& config) { auto size = shapes.size(); mgb::TensorShapeArray shapearr(size); @@ -48,18 +47,16 @@ SymbolVarArray _Opr::param_pack_split( } auto cn = src.node()->comp_node(); - auto table_val = megdnn::ParamPackSplit::gen_offsets( + auto offsets_val = megdnn::ParamPackConcat::gen_offsets( shapearr, cn.get_mem_addr_alignment(), src.dtype().size()); - if (!table.node()) { - if (config.has_comp_node_set()) { - cn = config.get_single_comp_node(); - } - HostTensorND hv{cn, TensorShape{{table_val.size()}}, dtype::Int32{}}; - memcpy(hv.raw_ptr(), table_val.data(), table_val.size() * sizeof(int)); - table = opr::ImmutableTensor::make(*src.node()->owner_graph(), hv); + if (config.has_comp_node_set()) { + cn = config.get_single_comp_node(); } + HostTensorND hv{cn, TensorShape{{offsets_val.size()}}, dtype::Int32{}}; + memcpy(hv.raw_ptr(), offsets_val.data(), offsets_val.size() * sizeof(int)); + auto offsets = opr::ImmutableTensor::make(*src.node()->owner_graph(), hv); - return mgb::opr::ParamPackSplit::make(src, table, table_val, shapearr, config); + return mgb::opr::ParamPackSplit::make(src, offsets, offsets_val, shapearr, config); } #if MGB_ENABLE_OPR_MM diff --git a/python_module/src/cpp/opr_defs.h b/python_module/src/cpp/opr_defs.h index f3f1eec73c45bdadbb3a585e529abb26421e185e..5c35835574e8ad2768f7cd8eee19755fc36c2f3c 100644 --- a/python_module/src/cpp/opr_defs.h +++ b/python_module/src/cpp/opr_defs.h @@ -44,8 +44,7 @@ static SymbolVar add_update(SymbolVar dest, SymbolVar delta, // tensor manip static SymbolVarArray param_pack_split( - SymbolVar src, SymbolVar table, - const std::vector>& shapes, + SymbolVar src, const std::vector>& shapes, const OperatorNodeConfig& config); static SymbolVar dimshuffle(SymbolVar src, diff --git a/python_module/src/python/opr_template.py b/python_module/src/python/opr_template.py index 80f00cf6c89736ab1aff8f0618e85a1a62890245..c83d4172d7cf4c82d97b22e243d0018f96063fdf 100644 --- a/python_module/src/python/opr_template.py +++ b/python_module/src/python/opr_template.py @@ -159,11 +159,11 @@ def dimshuffle(src, pattern, ndim=0, *, pattern_mgb.push_back(i) return _mgb._Opr.dimshuffle(src, pattern_mgb, int(ndim), config) -def param_pack_split(src, shapes, table=None, *, +def param_pack_split(src, shapes, *, name=None, comp_node=None, config=None): """ split param into a list of tensor for given shape - ParamPackSplit operator has two inputs: ``src`` and ``tables`` and would + ParamPackSplit operator has a input: ``src`` and would have a ``output``. output[i] indicates the address of tensor which part of ``src`` would transfer its elements into. @@ -172,24 +172,13 @@ def param_pack_split(src, shapes, table=None, *, output[0] indicates the address of tensor with shapes[0]:(1, 2, 4), output[1] indicates the address of tensor with shapes[1]:(4, 2, 2), output[2] indicates the address of tensor with shapes[2]:(4, 2, 1). - And table have the double size of input tensor. - For each element in the tensor input[i], we may have - output[outer_index[i]][inner_index[i]] = input[i]. - Table would the concatation of outer_index and inner_index, so more - alternatively, output[table[i]][table[i+len(input)]] = input[i] :param src: The concatenated input tensor. :type src: :class:`SymbolVar` :param shapes: Shapes of output tensors :type shapes: list of list of int - :param table: Output element mapping table; it if it is None, a table would - be generated from ``shapes`` - :type table: :class:`SymbolVar` with int32 type, or None """ config = _helper.gen_config(name, comp_node, config) - if isinstance(table, (list, tuple)) and isinstance(shapes, _mgb.SymbolVar): - # compatible with old API - table, shapes = shapes, table if not isinstance(shapes, (list, tuple)): raise TypeError('could not convert {} to tensor shapes'.format( @@ -202,10 +191,7 @@ def param_pack_split(src, shapes, table=None, *, assert min(s) > 0 shapes_mgb.push_back(s) - if table is None: - table = _mgb.SymbolVar() - - return _mgb._Opr.param_pack_split(src, table, shapes_mgb, config) + return _mgb._Opr.param_pack_split(src, shapes_mgb, config) class _modify_subtensor_helper: def __init__(self, dest, val, *, name=None, comp_node=None, config=None): diff --git a/src/opr/impl/tensor_manip.cpp b/src/opr/impl/tensor_manip.cpp index d29a25244496c1886dabc7f5beaf3176de76f1e2..f33f6f92db949f4cd37113fa4545cad2988dcaee 100644 --- a/src/opr/impl/tensor_manip.cpp +++ b/src/opr/impl/tensor_manip.cpp @@ -1400,8 +1400,8 @@ void ParamPackConcat::init_output_static_infer_desc(){ using namespace cg::static_infer; auto &&mgr = owner_graph()->static_infer_manager(); - auto infer_out = [this](TensorShape &dest, const InpVal &inp) { - dest = {m_offsets.back()}; + auto infer_out = [this](TensorShape& dest, const InpVal& inp) { + dest = {static_cast(m_offsets.back())}; return true; }; DepVal shp_deps; @@ -1476,9 +1476,6 @@ SymbolVarArray ParamPackSplit::make(const SymbolVar& src, return ret; } -void ParamPackSplit::scn_do_execute() { -} - void ParamPackSplit::init_output_dtype() { // already initialized in constructor } @@ -1518,7 +1515,6 @@ void ParamPackSplit::init_output_static_infer_desc() { MGB_IMPL_OPR_GRAD(ParamPackSplit) { mgb_assert(out_grad.size() == opr.output().size()); SmallVector grad; - // last var is workspace, ignore it for (size_t i = 0; i < out_grad.size(); ++i) { auto gval = out_grad[i]; if (!gval) { diff --git a/src/opr/include/megbrain/opr/tensor_manip.h b/src/opr/include/megbrain/opr/tensor_manip.h index c81f34bb5562317985434ecaee4f4f86e04204b2..8dcff9da646d3b5fa8a5545b8b069d95d29960e0 100644 --- a/src/opr/include/megbrain/opr/tensor_manip.h +++ b/src/opr/include/megbrain/opr/tensor_manip.h @@ -583,7 +583,7 @@ MGB_DEFINE_OPR_CLASS(ParamPackSplit, cg::SingleCNOperatorNodeBase) // { std::vector m_offsets; std::vector m_mem_fwd_success; - void scn_do_execute() override; + void scn_do_execute() override{}; void init_output_static_infer_desc() override; bool infer_shape(size_t index, TensorShape &dest, const cg::static_infer::InpVal &inp); diff --git a/src/opr/test/tensor_manip.cpp b/src/opr/test/tensor_manip.cpp index 55864658d7bea3346ade8dc61abb0d3421d8ec55..f3694c217585121befe2ef800b91a19951512850 100644 --- a/src/opr/test/tensor_manip.cpp +++ b/src/opr/test/tensor_manip.cpp @@ -1898,15 +1898,15 @@ void test_param_pack_concat(const TensorShapeArray &shapes, DType type){ srcs.push_back(nd); } - auto host_table_gen = megdnn::ParamPackSplit::gen_offsets(shapes, + auto host_offsets_gen = megdnn::ParamPackConcat::gen_offsets(shapes, cn.get_mem_addr_alignment(), 4); - ASSERT_EQ(host_table_gen.size(), size * 2); - auto host_table = std::make_shared(); - host_table->comp_node(cn).dtype(dtype::Int32{}).resize({size * 2}); - memcpy(host_table->raw_ptr(), host_table_gen.data(), size * 8); - auto table = opr::Host2DeviceCopy::make(*graph, host_table); + ASSERT_EQ(host_offsets_gen.back(), size); + auto host_offsets = std::make_shared(); + host_offsets->comp_node(cn).dtype(dtype::Int32{}).resize({srcs.size() * 2}); + memcpy(host_offsets->raw_ptr(), host_offsets_gen.data(), srcs.size() * 8); + auto offsets = opr::Host2DeviceCopy::make(*graph, host_offsets); - auto z = opr::ParamPackConcat::make(srcs, table, host_table_gen); + auto z = opr::ParamPackConcat::make(srcs, offsets, host_offsets_gen); HostTensorND host_z; auto func = graph->compile({make_callback_copy(z, host_z)}); @@ -1944,17 +1944,17 @@ void test_param_pack_split(const TensorShapeArray& shapes) { auto make_graph = [&](const typename Checker::SymInpArray& inputs) -> typename Checker::SymOutArray { - auto table_val = megdnn::ParamPackSplit::gen_offsets( + auto offsets_val = megdnn::ParamPackConcat::gen_offsets( shapes, cn.get_mem_addr_alignment(), 4); - HostTensorND table; - std::copy_n(table_val.data(), table_val.size(), - table.dtype(dtype::Int32{}) + HostTensorND offsets; + std::copy_n(offsets_val.data(), offsets_val.size(), + offsets.dtype(dtype::Int32{}) .comp_node(cn) - .resize({table_val.size()}) + .resize({offsets_val.size()}) .ptr()); - auto sym_table = opr::SharedDeviceTensor::make( - *inputs[0].node()->owner_graph(), table); - auto out = opr::ParamPackSplit::make(inputs[0], sym_table, table_val, + auto sym_offsets = opr::SharedDeviceTensor::make( + *inputs[0].node()->owner_graph(), offsets); + auto out = opr::ParamPackSplit::make(inputs[0], sym_offsets, offsets_val, shapes); mgb_assert(out.size() == nr_out); typename Checker::SymOutArray ret;