/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "io/paddle_mobile.h"
#ifdef PADDLE_MOBILE_CL
#include <CL/cl.h>
#include "framework/cl/cl_tensor.h"
#endif
#include "common/common.h"
#include "operators/math/gemm.h"

namespace paddle_mobile {

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
  omp_set_num_threads(num);
#endif
}

template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
                                  bool quantification, int batch_size,
                                  bool loddable) {
  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }

  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->Load(dirname, optimize, quantification), batch_size, optimize,
        loddable);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }

  return true;
}

template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
                                  const std::string &para_path, bool optimize,
                                  bool quantification, int batch_size,
                                  bool loddable) {
  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }

  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->Load(model_path, para_path, optimize, quantification),
        batch_size, optimize, loddable);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }

  return true;
}

template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
                                                const uint8_t *model_buf,
                                                size_t combined_params_len,
                                                uint8_t *combined_params_buf) {
  int batch_size = 1;
  bool optimise = true;
  bool quantification = false;

  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }

  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                    combined_params_buf, optimise,
                                    quantification),
        batch_size, optimise);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }

  return true;
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
    const framework::Tensor &t) {
  return executor_->Predict(t);
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> PaddleMobile<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  return executor_->PredictLod(t);
}

template <typename Dtype, Precision P>
std::vector<typename PaddleMobile<Dtype, P>::Ptype>
PaddleMobile<Dtype, P>::Predict(const std::vector<Ptype> &input,
                                const std::vector<int64_t> &dims) {
  return executor_->Predict(input, dims);
}

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Clear() {
  executor_ = nullptr;
  loader_ = nullptr;
}

template <typename Dtype, Precision P>
double PaddleMobile<Dtype, P>::GetPredictTime() {
  // No timing implementation for this device type.
  return 0.0;
}

#ifdef PADDLE_MOBILE_CPU
template <>
double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
  int m = 32;
  int n = 224 * 224;
  int k = 27;
  int lda = k;
  int ldb = n;
  int ldc = n;
  float *a = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * m * k));
  float *b = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * k * n));
  float *c = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  int t1 = 1;
  int t2 = 1;
  for (int i = 0; i < m * k; ++i) {
    unsigned int seed = 100;
    a[i] = t1 + rand_r(&seed) % t2;
  }
  for (int i = 0; i < k * n; ++i) {
    unsigned int seed = 200;
    b[i] = t1 + rand_r(&seed) % t2;
  }
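  // Time a single SGEMM whose shape matches a 3x3 convolution over a
  // 224x224 RGB input with 32 output channels (k = 3 * 3 * 3 unrolled
  // filter weights, n = 224 * 224 output positions). The elapsed time is
  // used as a rough proxy for this device's CPU inference speed.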
  paddle_mobile::operators::math::Gemm gemm;
  auto time1 = paddle_mobile::time();
  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
             static_cast<float>(0), c, ldc, false,
             static_cast<float *>(nullptr));
  auto time2 = paddle_mobile::time();
  double cost = paddle_mobile::time_diff(time1, time2);
  paddle_mobile::memory::Free(a);
  paddle_mobile::memory::Free(b);
  paddle_mobile::memory::Free(c);
  return cost;
}
#endif

template <typename Dtype, Precision P>
PaddleMobile<Dtype, P>::~PaddleMobile() {
  executor_ = nullptr;
  loader_ = nullptr;
}

#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
                                            std::string var_name) {
  executor_->InjectVariable(t, var_name);
}

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
  executor_->FeedData(t);
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(
    int id) {
  return executor_->FetchResult(id);
}

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
  executor_->Predict_From_To(start, end);
}

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From(int start) {
  executor_->Predict_From(start);
}

template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_To(int end) {
  executor_->Predict_To(end);
}
#endif

#ifdef PADDLE_MOBILE_CL
static std::mutex lc;
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
  std::lock_guard<std::mutex> lock(lc);
  if (framework::CLEngine::Instance()->GetCLPath() == "") {
    framework::CLEngine::Instance()->setClPath(path);
  }
}

template <>
double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
  cl_int status;
  cl_uint nPlatform;
  clGetPlatformIDs(0, NULL, &nPlatform);
  cl_platform_id *listPlatform = reinterpret_cast<cl_platform_id *>(
      malloc(nPlatform * sizeof(cl_platform_id)));
  clGetPlatformIDs(nPlatform, listPlatform, NULL);
  cl_uint nDevice = 0;
  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
  cl_device_id *listDevice = reinterpret_cast<cl_device_id *>(
      malloc(nDevice * sizeof(cl_device_id)));
  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice,
                 NULL);
  cl_context context =
      clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
  cl_command_queue queue =
      clCreateCommandQueue(context, listDevice[0], 0, &status);

  int n = 1;
  int c = 3;
  int h = 224;
  int w = 224;
  float *input = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224));
  float *filter = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27));
  int input_w = w * (c + 3) / 4;
  int input_h = n * h;
  int filter_w = 3 * (3 + 3) / 4;
  int filter_h = 32 * 3;
  int output_w = 224 * (32 + 3) / 4;
  int output_h = 1 * 224;

  framework::DDim input_dims = {1, 3, 224, 224};
  framework::CLTensor input_cl_tensor(context, queue);
  input_cl_tensor.Resize(input_dims);
  cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input);

  framework::DDim filter_dims = {32, 3, 3, 3};
  framework::CLTensor filter_cl_tensor(context, queue);
  filter_cl_tensor.Resize(filter_dims);
  cl_mem filterBuffer = filter_cl_tensor.mutable_with_data<float>(filter);

  cl_mem cl_filter_image = NULL;
  cl_mem cl_input_image = NULL;
  cl_mem cl_output_image = NULL;
  cl_image_format cf = {.image_channel_order = CL_RGBA,
                        .image_channel_data_type = CL_HALF_FLOAT};
  cl_input_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf,
                                   input_w, input_h, 0, NULL, &status);
  cl_filter_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf,
                                    filter_w, filter_h, 0, NULL, &status);
  cl_output_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf,
                                    output_w, output_h, 0, NULL, &status);

  char *code;
  std::string path = framework::CLEngine::Instance()->GetCLPath() +
                     "/cl_kernel/feed_kernel.cl";
  size_t length = readText(path.c_str(), &code);
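
  // Compile feed_kernel.cl and run its "feed" kernel twice: once to repack
  // the input buffer and once to repack the filter buffer into the RGBA
  // half-float image2D layout consumed by the convolution kernel below.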
  cl_program program = clCreateProgramWithSource(
      context, 1, (const char **)&code, &length, NULL);
  std::string path1 = "-cl-fast-relaxed-math -I " +
                      framework::CLEngine::Instance()->GetCLPath() +
                      "/cl_kernel";
  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "feed", &status);

  int out_H = 224;
  int out_W = 224;
  int out_C = 3;
  int Stride2 = out_C * out_H * out_W;
  int Stride1 = out_H * out_W;
  int Stride0 = out_W;

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
  CL_CHECK_ERRORS(status);
  size_t global_work_size[3] = {1, 224, 224};

  //  cl_event out_event = param.Out()->GetClEvent();

  status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
                                  NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);

  out_H = 3;
  out_W = 3;
  out_C = 3;
  Stride2 = out_C * out_H * out_W;
  Stride1 = out_H * out_W;
  Stride0 = out_W;

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
  CL_CHECK_ERRORS(status);
  size_t global_work_size1[3] = {1, 3, 96};

  //  cl_event out_event = param.Out()->GetClEvent();

  status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size1,
                                  NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);

  clFinish(queue);
  queue = clCreateCommandQueue(context, listDevice[0], 0, &status);

  path = framework::CLEngine::Instance()->GetCLPath() +
         "/cl_kernel/conv_kernel.cl";
  size_t length1 = readText(path.c_str(), &code);
  program = clCreateProgramWithSource(context, 1, (const char **)&code,
                                      &length1, &status);
  CL_CHECK_ERRORS(status);
  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
  kernel = clCreateKernel(program, "conv_3x3", &status);
  CL_CHECK_ERRORS(status);
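
  // Configure conv_3x3 for the 1x3x224x224 input and 32 3x3 filters
  // (stride 1, no offset), then time one kernel launch; clFinish() blocks
  // until the launch completes, so the measurement covers the real work.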
  int c_block = (32 + 3) / 4;
  int nh = n * h;
  int stride = 1;
  int offset = 0;
  int input_c = (c + 3) / 4;
  int dilation = 1;
  int input_width = 224;
  int input_height = 224;
  int output_width = 224;
  int output_height = 224;
  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

  //  cl_event out_event = param.Output()->GetClEvent();
  //  cl_event wait_event = param.Input()->GetClEvent();
  size_t global_work_size2[3] = {8, 224, 224};
  auto time1 = paddle_mobile::time();
  status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2,
                                  NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
  clFinish(queue);
  auto time2 = paddle_mobile::time();
  paddle_mobile::memory::Free(input);
  paddle_mobile::memory::Free(filter);
  if (status == CL_SUCCESS) {
    return paddle_mobile::time_diff(time1, time2);
  } else {
    return -1;
  }
}

template <typename Dtype, Precision P>
int PaddleMobile<Dtype, P>::readText(const char *kernelPath, char **pcode) {
  // Read a text file into *pcode; returns the buffer length (file size plus
  // the terminating '\0'), or -1 on failure.
  FILE *fp;
  int size;
  //  printf(" File: %s\n", kernelPath);
  fp = fopen(kernelPath, "rb");
  if (!fp) {
    printf(" Open file failed\n");
    return -1;
  }
  if (fseek(fp, 0, SEEK_END) != 0) {
    printf(" Seek end of file failed\n");
    fclose(fp);
    return -1;
  }
  if ((size = ftell(fp)) < 0) {
    printf(" Get file position failed\n");
    fclose(fp);
    return -1;
  }
  rewind(fp);
  if ((*pcode = reinterpret_cast<char *>(malloc(size + 1))) == NULL) {
    printf(" Allocate space failed\n");
    fclose(fp);
    return -1;
  }
  fread(*pcode, 1, size, fp);
  (*pcode)[size] = '\0';
  fclose(fp);
  return size + 1;
}
#endif

template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
template class PaddleMobile<GPU_CL, Precision::FP32>;

}  // namespace paddle_mobile
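
// Illustrative usage sketch (not part of this file). It assumes the default
// template arguments and parameter defaults declared in io/paddle_mobile.h,
// and "./my_model" is a hypothetical model directory:
//
//   paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;
//   pm.SetThreadNum(4);
//   if (pm.Load("./my_model", /*optimize=*/true)) {
//     std::vector<float> input(1 * 3 * 224 * 224, 0.f);
//     std::vector<float> output = pm.Predict(input, {1, 3, 224, 224});
//   }
//   pm.Clear();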