diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 15724523ded18e14cecf5d5aacf506992dadb3b4..2e802b120a6effd33bbe68048123c33f76a8aee8 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -106,11 +106,14 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     }
 
     std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-    for (int m = 0; m < nodes.size(); ++m) {
-      auto &node = nodes[m];
-      op_descs.push_back(node->op_desc_);
+    if (add_split) {
+      GenerateOps(&op_descs, begin_node.get(), add_split);
+    } else {
+      for (int m = 0; m < nodes.size(); ++m) {
+        auto &node = nodes[m];
+        op_descs.push_back(node->op_desc_);
+      }
     }
-    //  GenerateOps(&op_descs, begin_node.get());
     block->ops_ = op_descs;
   }
 
@@ -267,12 +270,12 @@ void ProgramOptimize::GenerateOps(
 }
 
 void ProgramOptimize::GenerateOps(
-    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
-    Node *begin_node) {
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs, Node *begin_node,
+    bool can_add_split) {
   //  std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
   //  Node *input_node, Node *current_node, bool adding_thread, int
   //  thread_num
-  if (false) {
+  if (can_add_split) {
     this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
   } else {
     this->GenerateOps(op_descs, begin_node, begin_node);
diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h
index 93943cf83951565d91f67bfa77881dbcb130278d..ae632da4bdf004f23e7dab86ab06a4e007fdb75b 100644
--- a/src/framework/program/program-optimize/program_optimize.h
+++ b/src/framework/program/program-optimize/program_optimize.h
@@ -34,7 +34,7 @@ class ProgramOptimize {
   int current_block_;
   std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
-                   Node *begin_node);
+                   Node *begin_node, bool can_add_split);
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                    Node *input_node, Node *current_node);
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
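The `add_split` path above routes op-desc emission through the new three-argument `GenerateOps` overload instead of the plain per-node loop. A minimal sketch of that dispatch shape, using hypothetical stand-in types (`OpDesc`, `Node`, and the single-successor walk are simplifications for illustration, not the framework's real classes):

```cpp
#include <memory>
#include <vector>

struct OpDesc {};
struct Node {
  std::shared_ptr<OpDesc> op_desc_;
  Node *next = nullptr;  // simplified single-successor chain
};

// When can_add_split is true, defer to a graph walk that may also emit
// split ops (elided here); otherwise emit the node list linearly, which
// is what the old per-node loop did.
void GenerateOps(std::vector<std::shared_ptr<OpDesc>> *op_descs,
                 Node *begin_node, bool can_add_split) {
  if (can_add_split) {
    // split-aware walk, corresponding to the six-argument overload
  } else {
    for (Node *n = begin_node; n != nullptr; n = n->next) {
      op_descs->push_back(n->op_desc_);
    }
  }
}
```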
diff --git a/src/io/io.cpp b/src/io/io.cpp
index 7afb44bb45ab333a2bbda4fee533be995d73a630..c60113bb7882b7482cf5a23e4ad48adb6ec63de8 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -14,9 +14,11 @@ limitations under the License.
 */
 
 #include "io.h"
 #include <vector>
+#define PADDLE_MOBILE_PROFILE
 #ifdef PADDLE_MOBILE_PROFILE
+#include <algorithm>
 #include <ctime>
-#include <map>
+#include <unordered_map>
 #endif
 #include "common/enforce.h"
@@ -74,8 +76,9 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname, bool optimize) {
-  auto program = this->LoadProgram(dirname + "/__model__", optimize);
+    const std::string &dirname, bool optimize, bool can_add_split) {
+  auto program =
+      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
   program.model_path = dirname;
   return program;
 }
@@ -92,7 +95,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
-    const std::string &model_path, bool optimize) {
+    const std::string &model_path, bool optimize, bool can_add_split) {
   std::string model_filename = model_path;
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
   uint8_t *buf = NULL;
@@ -144,7 +147,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
   if (optimize) {
     framework::ProgramOptimize program_optimize;
     program.optimizeProgram =
-        program_optimize.FushionOptimize(originProgramDesc);
+        program_optimize.FushionOptimize(originProgramDesc, can_add_split);
   }
   if (optimize) {
     program.optimizeProgram->Description("optimize: ");
@@ -308,6 +311,7 @@ void Executor<Dtype, P>::InitMemory() {
 
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::InitCombineMemory() {
+  LOG(kLOG_INFO) << " begin init combine memory";
   char *origin_data = Get_binary_data(program_.para_path);
   char *data = origin_data;
   for (const auto &block : to_predict_program_->Blocks()) {
@@ -328,6 +332,7 @@ void Executor<Dtype, P>::InitCombineMemory() {
     }
   }
   delete origin_data;
+  LOG(kLOG_INFO) << " end init combine memory ";
 }
 
 template <typename Dtype, Precision P>
@@ -341,31 +346,37 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(0);
 #ifdef PADDLE_MOBILE_PROFILE
-  std::map<std::string, clock_t> _profile;
+  std::unordered_map<std::string, clock_t> _profile;
 #endif
   for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
     auto op = ops_of_block_[*to_predict_block.get()][j];
 #ifdef PADDLE_MOBILE_PROFILE
-    _profile[op->Type()] = clock();
+    _profile[op->Type()] -= clock();
 #endif
     op->Run();
 #ifdef PADDLE_MOBILE_PROFILE
-    _profile[op->Type()] = clock() - _profile[op->Type()];
+    _profile[op->Type()] += clock();
 #endif
   }
 #ifdef PADDLE_MOBILE_PROFILE
   {
-    DLOG << "========================[ profile ]==========================";
+    std::cout << "====================[ profile ]======================\n";
+    using prof_t = std::pair<std::string, clock_t>;
+    std::vector<prof_t> _tprofile(_profile.begin(), _profile.end());
     clock_t _ptotal = 0;
-    for (auto const &p : _profile) {
+    for (auto const &p : _tprofile) {
       _ptotal += p.second;
     }
-    for (auto const &p : _profile) {
-      DLOG << p.first << std::string(16 - p.first.size(), ' ') << "\t"
-           << (float)p.second << "\t\t"
-           << (float)p.second / (float)_ptotal * 100.0;
+    auto compf = [](const prof_t &a, const prof_t &b) {
+      return a.second > b.second;
+    };
+    std::sort(_tprofile.begin(), _tprofile.end(), compf);
+    _tprofile.push_back(std::make_pair("total", _ptotal));
+    for (auto const &p : _tprofile) {
+      printf("%-16s\t%-10.0f\t%-.4f\n", p.first.c_str(), (float)p.second,
+             (float)p.second / _ptotal * 100.0);
     }
-    DLOG << "========================[ ]==========================";
+    std::cout << "====================[---------]======================\n";
   }
 #endif
   auto ops = ops_of_block_[*to_predict_program_->Block(0)];
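The profiling rewrite switches from storing per-op timestamps to an accumulation idiom: subtracting `clock()` before `Run()` and adding it back afterwards leaves the elapsed ticks in the bucket, and repeated ops of the same type now sum into a single entry instead of overwriting earlier measurements. A self-contained sketch of the same bookkeeping, with a hypothetical `run_op` standing in for `op->Run()`:

```cpp
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  std::unordered_map<std::string, clock_t> profile;
  auto run_op = [](const std::string &) { /* work elided */ };
  const char *ops[] = {"conv2d", "relu", "conv2d"};
  for (const char *type : ops) {
    profile[type] -= clock();  // operator[] value-initializes to 0 on first touch
    run_op(type);
    profile[type] += clock();  // bucket now holds accumulated (end - start)
  }
  // Snapshot, sort descending by cost, and append a synthetic "total" row,
  // mirroring the printf table in Predict().
  using prof_t = std::pair<std::string, clock_t>;
  std::vector<prof_t> sorted(profile.begin(), profile.end());
  clock_t total = 0;
  for (const auto &p : sorted) total += p.second;
  std::sort(sorted.begin(), sorted.end(),
            [](const prof_t &a, const prof_t &b) { return a.second > b.second; });
  sorted.push_back({"total", total});
  for (const auto &p : sorted) {
    printf("%-16s\t%-10.0f\t%-.4f\n", p.first.c_str(), (float)p.second,
           total ? (float)p.second / total * 100.0f : 0.0f);
  }
  return 0;
}
```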
diff --git a/src/io/io.h b/src/io/io.h
index fb18ca0cc1768f5cfe39acfcba7d0117a67e1de5..a1fbf158c2b026336d363db512cb44fe58ee93db 100644
--- a/src/io/io.h
+++ b/src/io/io.h
@@ -35,7 +35,8 @@ class Loader {
    * @b load a fluid model whose weights are stored in separate files
    * */
   const framework::Program<Dtype, P> Load(const std::string &dirname,
-                                          bool optimize = false);
+                                          bool optimize = false,
+                                          bool can_add_split = false);
 
   /*
    * @b load combine format fluid mode
@@ -47,7 +48,8 @@ class Loader {
 
  private:
   const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
-                                                 bool optimize = false);
+                                                 bool optimize = false,
+                                                 bool can_add_split = false);
 };
 
 template <typename Dtype, Precision P>
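Both new parameters default to `false`, so existing call sites keep compiling unchanged and a caller opts in explicitly. A usage sketch (the model path is a placeholder, and the `<paddle_mobile::CPU>` template argument follows the test's usage):

```cpp
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // Load as before, without fusion optimization.
  auto plain = loader.Load("/path/to/model_dir");
  // Fusion-optimized load that is also allowed to insert split ops.
  auto optimized = loader.Load("/path/to/model_dir", true, true);
  return 0;
}
```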
diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
index 6cd4538c4540ff11d91a6f49d088ad38f6d992e7..e6f27b772562789e07807b2b56c1f9d73bf373a9 100644
--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
   output->mutable_data<float>();
-  int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
 
@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
 
   std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
   std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
   std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
   col_shape_vec[0] = input->dims()[1] / groups;
 
@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
-  //  DLOG << " col_shape = " << col_shape;
-  //  DLOG << " col_matrix_shape = " << col_matrix_shape;
 
   framework::DDim input_shape = framework::slice_ddim(
       input->dims(), 1, static_cast<int>(input->dims().size()));
-  //  DLOG << " input_shape = " << input_shape;
 
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  //  DLOG << " filter.dims() = " << filter.dims();
-
   framework::DDim output_matrix_shape = {
       output->dims()[1],
       output->numel() / (output->dims()[0] * output->dims()[1])};
@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   for (int i = 0; i < batch_size; i++) {
     Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
     Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    //  DLOG << " in_batch.dims() = " << in_batch.dims();
-    //  DLOG << " out_batch.dims() = " << out_batch.dims();
 
     for (int g = 0; g < groups; g++) {
       Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      //  DLOG << " out_slice " << out_slice.dims();
-      //  DLOG << " filter_slice " << filter_slice.dims();
-      //  DLOG << " col_matrix " << col_matrix.dims();
       math::matmul<float>(filter_slice, false, col_matrix, false,
                           static_cast<float>(1), &out_slice,
                           static_cast<float>(0));
-      auto filter_ptr = filter_slice.data<float>();
     }
   }
 }
diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp
index 3ebe261fb8fe511022d6efbf4641898ef326319f..1b41968f40d036d55b98298a76564dcc12576571 100644
--- a/src/operators/kernel/arm/transpose_kernel.cpp
+++ b/src/operators/kernel/arm/transpose_kernel.cpp
@@ -11,29 +11,28 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifdef TRANSPOSE_OP
 
 #include "operators/kernel/transpose_kernel.h"
-
 namespace paddle_mobile {
 namespace operators {
 
-template <typename T>
-void TransposeFunc(const int numel, const T* input, const vector<int> axis,
-                   const vector<int> old_strides, const vector<int> new_strides,
-                   T* output) {
-  for (int i = 0; i < numel; ++i) {
-    int old_idx = 0;
-    int idx = i;
-    for (int j = 0; j < axis.size(); ++j) {
-      int order = axis[j];
-      old_idx += (idx / new_strides[j]) * old_strides[order];
-      idx %= new_strides[j];
-    }
-    output[i] = input[old_idx];
-  }
-}
+// vector<int> pos;
+// template <typename T>
+// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
+//                    const vector<int> old_strides, const vector<int>
+//                    new_strides, T* output) {
+//   for (int i = 0; i < numel; ++i) {
+//     int old_idx = 0;
+//     int idx = i;
+//     for (int j = 0; j < axis.size(); ++j) {
+//       int order = axis[j];
+//       old_idx += (idx / new_strides[j]) * old_strides[order];
+//       idx %= new_strides[j];
+//     }
+//     output[i] = input[old_idx];
+//   }
+// }
 
 template <>
 void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
@@ -44,28 +43,38 @@ void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
   const auto* input_x_data = input_x->data<float>();
   auto* out_data = out->mutable_data<float>();
 
-  size_t axis_size = axis.size();
-  std::vector<int> new_dims;
-  new_dims.reserve(axis_size);
-  for (auto c : axis) {
-    new_dims.push_back(input_x_dims[c]);
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
   }
 
-  std::vector<int> old_strides;
-  std::vector<int> new_strides;
-  for (int i = 0; i < axis.size(); i++) {
-    int temp_old = 1;
-    int temp_new = 1;
-    for (int j = i + 1; j < axis.size(); j++) {
-      temp_old *= input_x_dims[j];
-      temp_new *= new_dims[j];
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
     }
-    old_strides.push_back(temp_old);
-    new_strides.push_back(temp_new);
   }
-
-  TransposeFunc<float>(input_x->numel(), input_x_data, axis, old_strides,
-                       new_strides, out_data);
 }
 
 }  // namespace operators
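The rewritten kernel iterates the output linearly and carries the matching input offset like an odometer: each step adds the input stride of the fastest-varying output axis, and a wrap-around on axis j rolls the offset back by one full extent (`xout[j]`) before bumping the next axis. This trades the old `TransposeFunc`'s per-element division and modulo for one addition per element plus occasional carry fix-ups. A standalone sketch of the same arithmetic, checkable on a 2x3 matrix:

```cpp
#include <cstdio>
#include <vector>

// Odometer-style transpose mirroring the rewritten kernel.
void transpose(const float *in, float *out, const std::vector<int> &in_dims,
               const std::vector<int> &axis) {
  int ndim = (int)axis.size();
  std::vector<int> xdim(ndim), xstride(ndim), xout(ndim);
  for (int i = 0; i < ndim; i++) {
    int j = ndim - 1 - i;        // j = 0 is the fastest-varying output axis
    xdim[j] = in_dims[axis[i]];  // output extent along that axis
    xstride[j] = 1;              // input stride of the source axis
    for (int k = axis[i] + 1; k < ndim; k++) xstride[j] *= in_dims[k];
    xout[j] = xstride[j] * xdim[j];  // offset accumulated over one full wrap
  }
  int numel = 1;
  for (int d : in_dims) numel *= d;
  std::vector<int> ind(ndim, 0);  // output coordinates, fastest axis first
  size_t pind = 0;                // input offset of the current element
  for (int i = 0; i < numel; i++) {
    out[i] = in[pind];
    ind[0]++;
    pind += xstride[0];
    for (int j = 0; j < ndim - 1; j++) {
      if (ind[j] == xdim[j]) {   // carry: axis j wrapped around
        ind[j + 1]++;
        ind[j] = 0;
        pind += xstride[j + 1];  // advance the next axis...
        pind -= xout[j];         // ...and undo the full wrap on axis j
      } else {
        break;
      }
    }
  }
}

int main() {
  // 2x3 matrix transposed to 3x2: axis = {1, 0}
  const float a[6] = {0, 1, 2, 3, 4, 5};
  float b[6];
  transpose(a, b, {2, 3}, {1, 0});
  for (float v : b) printf("%.0f ", v);  // expected: 0 3 1 4 2 5
  printf("\n");
  return 0;
}
```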
\n\t" + : [buffer] "+r"(buffer) + : [Bij] "r"(Bij) + : "memory", "q0"); } } if (paddingN != 0) { diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 87d65bdd28a42c4510668345ad7ce7058eb2cdf8..e510f4cdc9c6a1163914f72f73f1722529df9e16 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -20,9 +20,9 @@ limitations under the License. */ #define C(i, j) C[(i)*ldc + (j)] // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k -#define MC 384 -#define KC 384 -#define NC 4096 +#define MC 128 +#define KC 128 +#define NC 1024 #define MR 4 #define NR 4 diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 2300f05c99a122b352d888a45ca3c6ef082469ba..32d314826f8d6bd4e504b16cd78464d660919a30 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -19,9 +19,10 @@ int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(g_resnet, true); - loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + "/params", - true); + auto program = loader.Load(g_googlenet, true, true); + // loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + + // "/params", + // true); program.originProgram->Description("program desc: "); return 0;