Unverified commit 136e78cd, authored by Ray Liu, committed by GitHub

Merge pull request #1232 from codeWorm2015/develop

fix #1231: merge opencl branch, support Adreno GPU and Mali GPU
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
cmake_minimum_required(VERSION 3.6)
option(USE_OPENMP "openmp support" OFF)
option(USE_OPENMP "openmp support" ON)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" OFF)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" ON)
option(FPGA "fpga" OFF)
project(paddle-mobile)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
@@ -70,7 +71,27 @@ else()
endforeach()
endif()
if(MALI_GPU)
if (GPU_CL)
add_definitions(-DPADDLE_MOBILE_CL)
# opencl version
add_definitions(-DCL_TARGET_OPENCL_VERSION=220)
link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so)
include_directories(third_party/opencl/OpenCL-Headers)
else()
file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (GPU_MALI)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
@@ -124,17 +145,17 @@ endif()
if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif()
if(IS_IOS)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h)
endif ()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <chrono>
#include <chrono> // NOLINT
namespace paddle_mobile {
using Time = decltype(std::chrono::high_resolution_clock::now());
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
} // namespace paddle_mobile
@@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception {
std::string detail(buffer); \
throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
__FILE__, __LINE__); \
}
} \
exit(0);
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
@@ -39,7 +39,13 @@ struct PrecisionTrait<Precision::FP16> {
};
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
enum DeviceTypeEnum {
kINVALID = -1,
kCPU = 0,
kFPGA = 1,
kGPU_MALI = 2,
kGPU_CL = 3
};
template <DeviceTypeEnum T>
struct DeviceType {};
@@ -47,6 +53,7 @@ struct DeviceType {};
typedef DeviceType<kCPU> CPU;
typedef DeviceType<kFPGA> FPGA;
typedef DeviceType<kGPU_MALI> GPU_MALI;
typedef DeviceType<kGPU_CL> GPU_CL;
//! data type
enum DataType {
@@ -117,9 +117,9 @@ class Attribute {
template <typename Vistor>
static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<int>());
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<float>());
} else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
return vistor(attr.variant_.GetString());
@@ -129,7 +129,7 @@ class Attribute {
return vistor(attr.variant_.Get<vector<float>>());
} else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
return vistor(attr.variant_.Get<vector<string>>());
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<bool>());
} else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
return vistor(attr.variant_.Get<vector<bool>>());
@@ -137,7 +137,6 @@ class Attribute {
return vistor(attr.variant_.Get<int64_t>());
} else {
PADDLE_MOBILE_THROW_EXCEPTION("type not support");
exit(0);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
struct CLKernelDeleter {
template <class T>
void operator()(T *clKernelObj) {
clReleaseKernel(clKernelObj);
}
};
struct CLMemDeleter {
template <class T>
void operator()(T *clMemObj) {
clReleaseMemObject(clMemObj);
}
};
struct CLEventDeleter {
template <class T>
void operator()(T *clEventObj) {
clReleaseEvent(clEventObj);
}
};
struct CLCommQueueDeleter {
template <class T>
void operator()(T *clQueueObj) {
clReleaseCommandQueue(clQueueObj);
}
};
struct CLContextDeleter {
template <class T>
void operator()(T *clContextObj) {
clReleaseContext(clContextObj);
}
};
struct CLProgramDeleter {
template <class T>
void operator()(T *clProgramObj) {
clReleaseProgram(clProgramObj);
}
};
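These deleters are meant to be supplied as the custom deleter of a std::unique_ptr, so the matching clRelease* call runs automatically when the owning pointer is destroyed. A minimal sketch of that pattern (the aliases here are illustrative; cl_engine.h below spells the same types out inline):

#include <memory>
#include "CL/cl.h"

// Owning handles for OpenCL objects: destruction triggers the clRelease* call.
using UniqueKernel = std::unique_ptr<_cl_kernel, CLKernelDeleter>;
using UniqueMem = std::unique_ptr<_cl_mem, CLMemDeleter>;
using UniqueContext = std::unique_ptr<_cl_context, CLContextDeleter>;

// e.g. UniqueKernel kernel(clCreateKernel(program, "relu", &status));
// kernel.get() hands the raw cl_kernel to clSetKernelArg / clEnqueueNDRangeKernel.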
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_engine.h"
#include "CL/cl.h"
#include "framework/cl/cl_tool.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace framework {
bool CLEngine::Init() {
if (initialized_) {
return true;
}
cl_int status;
SetPlatform();
SetClDeviceId();
initialized_ = true;
return initialized_;
// setClCommandQueue();
// std::string filename = "./HelloWorld_Kernel.cl";
// loadKernelFromFile(filename.c_str());
// buildProgram();
}
CLEngine *CLEngine::Instance() {
static CLEngine cl_engine_;
cl_engine_.Init();
return &cl_engine_;
}
bool CLEngine::SetPlatform() {
platform_ = NULL; // the chosen platform
cl_uint numPlatforms; // the NO. of platforms
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
/**For clarity, choose the first available platform. */
if (numPlatforms > 0) {
cl_platform_id *platforms = reinterpret_cast<cl_platform_id *>(
malloc(numPlatforms * sizeof(cl_platform_id)));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform_ = platforms[0];
free(platforms);
return true;
} else {
return false;
}
}
bool CLEngine::SetClDeviceId() {
cl_uint numDevices = 0;
devices_ = NULL;
cl_int status =
clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (numDevices > 0) {
devices_ = reinterpret_cast<cl_device_id *>(
malloc(numDevices * sizeof(cl_device_id)));
status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_,
NULL);
return true;
}
return false;
}
// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel(
// const std::string &kernel_name) {
// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel(
// clCreateKernel(program_.get(), kernel_name.c_str(), NULL));
// return std::move(kernel);
//}
//
// bool CLEngine::SetClCommandQueue() {
// cl_int status;
// command_queue_.reset(
// clCreateCommandQueue(context_.get(), devices_[0], 0, &status));
// return true;
//}
// bool CLEngine::SetClContext() {
// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL));
// return true;
//}
// bool CLEngine::LoadKernelFromFile(const char *kernel_file) {
// size_t size;
// char *str;
// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary));
//
// if (!f.is_open()) {
// return false;
// }
//
// size_t fileSize;
// f.seekg(0, std::fstream::end);
// size = fileSize = (size_t)f.tellg();
// f.seekg(0, std::fstream::beg);
// str = new char[size + 1];
// if (!str) {
// f.close();
// return 0;
// }
//
// f.read(str, fileSize);
// f.close();
// str[size] = '\0';
// const char *source = str;
// size_t sourceSize[] = {strlen(source)};
// program_.reset(
// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize,
// NULL));
// return true;
//}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "CL/cl.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLEngine {
public:
static CLEngine *Instance();
bool Init();
std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() {
cl_int status;
cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status);
std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c);
CL_CHECK_ERRORS(status);
return std::move(context_ptr);
}
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
cl_context context) {
cl_int status;
cl_command_queue queue =
clCreateCommandQueue(context, devices_[0], 0, &status);
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr(
queue);
CL_CHECK_ERRORS(status);
return std::move(command_queue_ptr);
}
std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
cl_context context, std::string file_name) {
FILE *file = fopen(file_name.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
file_name.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size + 1];
size_t bytes_read = fread(data, 1, size, file);
data[size] = '\0';
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
const char *source = data;
size_t sourceSize[] = {strlen(source)};
cl_program p =
clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
DLOG << " cl kernel file name: " << file_name;
DLOG << " source size: " << sourceSize[0];
CL_CHECK_ERRORS(status_);
std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
return std::move(program_ptr);
}
std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
cl_event event = clCreateUserEvent(context, &status_);
std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
CL_CHECK_ERRORS(status_);
return std::move(event_ptr);
}
bool BuildProgram(cl_program program) {
cl_int status;
status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel",
0, 0);
CL_CHECK_ERRORS(status);
if (status == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = reinterpret_cast<char *>(malloc(log_size));
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
DLOG << " program build error: " << log;
}
if (status == CL_SUCCESS) {
return true;
} else {
return false;
}
}
cl_device_id DeviceID(int index = 0) { return devices_[index]; }
private:
CLEngine() { initialized_ = false; }
bool SetPlatform();
bool SetClDeviceId();
bool initialized_;
cl_platform_id platform_;
cl_device_id *devices_;
cl_int status_;
std::unique_ptr<_cl_program, CLProgramDeleter> program_;
// bool SetClContext();
// bool SetClCommandQueue();
// bool LoadKernelFromFile(const char *kernel_file);
// bool BuildProgram();
};
} // namespace framework
} // namespace paddle_mobile
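A sketch of how the engine singleton is driven; CLScope further down follows exactly this sequence, and the kernel file name here is only an example:

// Sketch: obtain the engine, create a context and queue, then build a program.
auto *engine = paddle_mobile::framework::CLEngine::Instance();
auto context = engine->CreateContext();
auto queue = engine->CreateClCommandQueue(context.get());
auto program = engine->CreateProgramWith(context.get(), "./cl_kernel/conv_kernel.cl");  // example path
engine->BuildProgram(program.get());
cl_device_id device = engine->DeviceID();  // first GPU device by default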
(This file's diff has been collapsed.)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace framework {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace framework
} // namespace paddle_mobile
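A usage sketch for the conversions declared above (buffer size is illustrative); the image converters below apply the same element-wise conversion before data is written into a CL_HALF_FLOAT image:

float src[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
paddle_mobile::framework::half_t dst[8];
paddle_mobile::framework::FloatArray2HalfArray(src, dst, 8);   // float -> half
float back[8];
paddle_mobile::framework::HalfArray2FloatArray(dst, back, 8);  // half -> float round trip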
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_scope.h"
namespace paddle_mobile {
namespace framework {
class CLHelper {
public:
CLHelper() = default;
explicit CLHelper(CLScope *scope) : scope_(scope) {}
void AddKernel(const std::string &kernel_name, const std::string &file_name) {
DLOG << " begin add kernel ";
auto kernel = scope_->GetKernel(kernel_name, file_name);
DLOG << " add kernel ing ";
kernels.emplace_back(std::move(kernel));
}
cl_kernel KernelAt(const int index) {
DLOG << " kernel count: " << kernels.size();
return kernels[index].get();
}
cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); }
cl_context CLContext() { return scope_->Context(); }
std::vector<size_t> DefaultWorkSize(const CLImage &image) {
// n c h w
auto image_dim = image.dims();
if (image_dim.size() == 4) {
auto n = image_dim[0];
auto h = image_dim[2];
auto w = image_dim[3];
auto image_width = image.ImageWidth();
auto work_size_0 = image_width / w;
auto work_size_1 = w;
auto work_size_2 = n * h;
return {work_size_0, work_size_1, work_size_2};
} else if (image_dim.size() == 2) {
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
private:
CLScope *scope_;
std::vector<std::unique_ptr<_cl_kernel, CLKernelDeleter>> kernels;
};
} // namespace framework
} // namespace paddle_mobile
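A sketch of how an operator's OpenCL kernel is expected to drive the helper; the kernel name, file name, and output image are illustrative:

// Sketch: scope is the CLScope* owned by the current Scope.
paddle_mobile::framework::CLHelper helper(scope);
helper.AddKernel("relu", "relu_kernel.cl");             // build (or reuse) the program, create the kernel
cl_kernel kernel = helper.KernelAt(0);
auto work_size = helper.DefaultWorkSize(output_image);  // output_image: a 4-D CLImage
// clSetKernelArg(kernel, ...);
// clEnqueueNDRangeKernel(helper.CLCommandQueue(), kernel, 3, NULL,
//                        work_size.data(), NULL, 0, NULL, NULL);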
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
int width = cl_image.ImageDims()[0];
int height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
printer << " dims: " << cl_image.dims() << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << tensor_data[i] << " ";
}
delete[](tensor_data);
delete[](image_data);
return printer;
}
#endif
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
class CLImage {
public:
CLImage() = default;
~CLImage() {
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
}
if (image_converter_) {
delete (image_converter_);
}
}
/*
 * does not keep a reference to the input tensor data; the floats are copied
 * (memcpy) inside this method
 * */
void SetTensorData(float *tensorData, const DDim &dim) {
int numel = product(dim);
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
tensor_data_ = nullptr;
}
tensor_data_ = new float[numel];
memcpy(tensor_data_, tensorData, numel * sizeof(float));
tensor_dims_ = dim;
}
/*
 * SetTensorData must be called first.
 *
 * uses the folder converter; one- and two-dimensional tensors are folded
 * */
void InitCLImage(cl_context context, cl_command_queue command_queue) {
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
InitCLImage(context, command_queue, folder_converter);
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
CLImageConverterBase *converter) {
if (image_converter_ != nullptr) {
delete (image_converter_);
}
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
DLOG << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
half_t *image_data = new half_t[product(image_dims_) * 4];
DLOG << " convert to image";
converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
DLOG << " end convert to image";
InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
delete[](image_data);
delete[](tensor_data_);
command_queue_ = command_queue;
tensor_data_ = nullptr;
image_converter_ = converter;
initialized_ = true;
DLOG << " end init cl image";
}
void InitNImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock();
InitCLImage(context, command_queue, folder_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitDWImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock();
InitCLImage(context, command_queue, dw_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
inline size_t ImageWidth() const { return image_dims_[0]; }
inline size_t ImageHeight() const { return image_dims_[1]; }
inline cl_command_queue CommandQueue() const { return command_queue_; }
/*
* resize original tensor dim
* */
inline CLImage &Resize(const DDim &dims) {
tensor_dims_ = dims;
return *this;
}
template <typename T>
T *data() const {
if (initialized_) {
PADDLE_MOBILE_THROW_EXCEPTION(
" cl image has initialized, tensor data has been deleted, can't use "
"tensor data");
}
return reinterpret_cast<T *>(tensor_data_);
}
/*
* numel of tensor dim
* */
inline int64_t numel() const { return product(tensor_dims_); }
/*
* original tensor dim
* */
const DDim &dims() const { return tensor_dims_; }
cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; }
private:
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
.image_height = height,
.image_depth = 1,
.image_array_size = 1,
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
// .buffer = nullptr
};
cid.buffer = nullptr;
cl_int err;
cl_mem cl_image = clCreateImage(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
data, // void *host_ptr
&err);
cl_image_.reset(cl_image);
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
bool initialized_ = false;
std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
DDim tensor_dims_;
DDim image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
CLImageConverterBase *image_converter_ = nullptr;
};
void TensorToCLImage(Tensor *tensor, CLImage *image,
cl_command_queue commandQueue);
void CLImageToTensor(CLImage *image, Tensor *tensor,
cl_command_queue commandQueue);
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image);
#endif
} // namespace framework
} // namespace paddle_mobile
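A sketch of the two initialization paths above, mirroring how the executor later in this change uses them (the host buffer and dims are illustrative):

// Persistable parameter: host data is staged first, then uploaded as a half RGBA image.
paddle_mobile::framework::CLImage weight_image;
weight_image.SetTensorData(weight_floats, paddle_mobile::framework::make_ddim({32, 3, 3, 3}));
weight_image.InitCLImage(context, command_queue);  // folder converter by default

// Op output: no host data yet, only an empty image of the right shape.
paddle_mobile::framework::CLImage out_image;
out_image.InitEmptyImage(context, command_queue, paddle_mobile::framework::make_ddim({1, 32, 112, 112}));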
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image_converter.h"
namespace paddle_mobile {
namespace framework {
const DDim &CLImageConverterDefault::InitImageDimInfoWith(
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
return make_ddim({width, height});
}
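Worked example of the packing above: four consecutive channels share one RGBA pixel, so for a tensor with N=1, C=8, H=16, W=16 the image is width = W * ((C + 3) / 4) = 16 * 2 = 32 and height = H * N = 16.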
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = nchw;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
int width = image_dim[0];
int height = image_dim[1];
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
const DDim &CLImageConverterFolder::InitImageDimInfoWith(
const DDim &tensor_dim) {
if (tensor_dim.size() <= 2) {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
int width = (tdim[1] + 3) / 4;
int height = tdim[0];
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
return make_ddim({width, height});
} else {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
} else {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
DDim image_dim = InitImageDimInfoWith(tensor_dim);
int width = image_dim[0];
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
} else {
int width = image_dim[0];
int height = image_dim[1];
int H, W;
if (tensor_dim.size() == 2) {
H = tensor_dim[0];
W = tensor_dim[1];
} else if (tensor_dim.size() == 1) {
H = 1;
W = tensor_dim[0];
}
float *p = tensor;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
}
const DDim &CLImageConverterNWBlock::InitImageDimInfoWith(
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
auto image_dim = InitImageDimInfoWith(tensor_dim);
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < block * 4; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
}
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = Half2Float(image[index]);
p++;
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
const DDim &CLImageConverterDWBlock::InitImageDimInfoWith(
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[1];
C = new_dims[0];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[1];
int C = tensor_dim[0];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/cl/cl_half.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace framework {
class CLImageConverterBase {
public:
virtual void NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLScope {
public:
CLScope() {
CLEngine *engin = CLEngine::Instance();
context_ = engin->CreateContext();
command_queue_ = engin->CreateClCommandQueue(context_.get());
}
cl_command_queue CommandQueue() { return command_queue_.get(); }
std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
const std::string &kernel_name, const std::string &file_name) {
DLOG << " to get program " << file_name;
auto program = Program(file_name);
DLOG << " end get program ~ ";
DLOG << " to create kernel: " << kernel_name;
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " end create kernel ~ ";
return std::move(kernel);
}
cl_context Context() { return context_.get(); }
cl_program Program(const std::string &file_name) {
auto it = programs_.find(file_name);
if (it != programs_.end()) {
return it->second.get();
}
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(), "./cl_kernel/" + file_name);
DLOG << " --- begin build program -> " << file_name << " --- ";
CLEngine::Instance()->BuildProgram(program.get());
DLOG << " --- end build program -> " << file_name << " --- ";
programs_[file_name] = std::move(program);
return programs_[file_name].get();
}
private:
cl_int status_;
std::unique_ptr<_cl_context, CLContextDeleter> context_;
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_;
std::unordered_map<std::string,
std::unique_ptr<_cl_program, CLProgramDeleter>>
programs_;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/tensor_base.h"
namespace paddle_mobile {
namespace framework {
class CLTensor : TensorBase {
public:
CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
CLTensor() = default;
/*
 * if the constructor did not set the context and command queue, set them here
 * */
void SetContextAndCommandQueue(cl_context context,
cl_command_queue command_queue) {
context_ = context;
command_queue_ = command_queue;
}
/*! Resize the dimensions of the memory block. */
inline CLTensor &Resize(const DDim &dims) {
dims_ = dims;
return *this;
}
template <typename T>
inline cl_mem mutable_with_data(const T *data) {
int64_t size = numel() * sizeof(T);
holder_.reset(new PlaceholderImpl(
size, reinterpret_cast<void *>(const_cast<T *>(data)), typeid(T),
context_, command_queue_));
return reinterpret_cast<cl_mem>(holder_->ptr());
}
inline cl_mem mutable_data(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
offset_ = 0;
}
return reinterpret_cast<cl_mem>(holder_->ptr());
}
/**
 * @brief Returns the underlying cl buffer.
 * @note Allocates the buffer if it does not already exist.
 */
template <typename T>
inline cl_mem mutable_data() {
return reinterpret_cast<cl_mem>(mutable_data(typeid(T)));
}
/**
 * @brief Resizes the tensor and returns the underlying cl buffer.
 *
 * @param[in] dims The dimensions of the memory block.
 *
 * @note Allocates the buffer if it does not already exist.
 */
template <typename T>
inline cl_mem mutable_data(DDim dims) {
Resize(dims);
return mutable_data<T>();
}
inline cl_mem CLBuffer() {
check_memory_size();
return reinterpret_cast<cl_mem>(
reinterpret_cast<uintptr_t>(holder_->ptr()));
}
template <typename T>
inline T *Data() {
if (host_ptr_) {
delete (host_ptr_);
host_ptr_ = nullptr;
}
cl_mem buffer = CLBuffer();
host_ptr_ = new char[holder_->size()];
cl_int status;
status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0,
holder_->size(), host_ptr_, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
return reinterpret_cast<T *>(host_ptr_);
}
int memorySize() { return holder_->size(); }
~CLTensor() {
DLOG << "~CLTensor";
if (host_ptr_) {
DLOG << " delete host ptr ";
delete (host_ptr_);
host_ptr_ = nullptr;
}
}
private:
cl_context context_;
cl_command_queue command_queue_;
void *host_ptr_ = nullptr;
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t size, void *input, std::type_index type,
cl_context context, cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, reinterpret_cast<void *>(input), NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
PlaceholderImpl(size_t size, std::type_index type, cl_context context,
cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
virtual size_t size() const { return size_; }
virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
std::unique_ptr<_cl_mem, CLMemDeleter> ptr_;
size_t size_;
/* the current type of memory */
std::type_index type_;
cl_command_queue command_queue_;
};
};
} // namespace framework
} // namespace paddle_mobile
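A sketch of typical CLTensor use for kernel arguments that live in a plain cl_mem buffer rather than an image (the host data and dims are illustrative):

// Sketch: stage host floats into a CL buffer, then read results back.
paddle_mobile::framework::CLTensor t(context, command_queue);
t.Resize(paddle_mobile::framework::make_ddim({1, 64}));
cl_mem buf = t.mutable_with_data<float>(host_floats);  // copies host_floats into a new buffer
// ... pass buf to the kernel with clSetKernelArg and enqueue it ...
float *result = t.Data<float>();                        // blocking read back to the host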
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
const char *opencl_error_to_str(cl_int error) {
#define CASE_CL_CONSTANT(NAME) \
case NAME: \
return #NAME;
// Suppose that no combinations are possible.
switch (error) {
CASE_CL_CONSTANT(CL_SUCCESS)
CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
CASE_CL_CONSTANT(CL_MAP_FAILURE)
CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_VALUE)
CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
CASE_CL_CONSTANT(CL_INVALID_DEVICE)
CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
CASE_CL_CONSTANT(CL_INVALID_BINARY)
CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
CASE_CL_CONSTANT(CL_INVALID_KERNEL)
CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_EVENT)
CASE_CL_CONSTANT(CL_INVALID_OPERATION)
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
default:
return "UNKNOWN ERROR CODE";
}
#undef CASE_CL_CONSTANT
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
namespace paddle_mobile {
namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
}
} // namespace framework
} // namespace paddle_mobile
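Usage sketch for the macro above: wrap the status of any OpenCL call so a failure is reported with its symbolic name and the call site (the buffer size and context are illustrative):

cl_int status;
cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, 1024, NULL, &status);
CL_CHECK_ERRORS(status);  // on failure prints e.g. CL_INVALID_CONTEXT with file and line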
@@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) {
return DataLayout::kAnyLayout;
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
exit(0);
}
}
@@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
return "ANY_LAYOUT";
default:
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
exit(0);
break;
}
}
@@ -42,7 +42,7 @@ struct Dim {
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */
Dim(int64_t idx) : head(idx), tail(idx) {}
explicit Dim(int64_t idx) : head(idx), tail(idx) {}
bool operator==(const Dim<i> &o) const {
return (head == o.head) && (tail == o.tail);
@@ -65,7 +65,7 @@ template <>
struct Dim<0> {
static constexpr int dimensions = 0;
Dim(int64_t _head) {}
explicit Dim(int64_t _head) {}
Dim() {}
@@ -131,7 +131,6 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
template <int D>
@@ -148,7 +147,6 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
} // namespace
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/executor.h"
#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
@@ -26,11 +26,24 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
namespace paddle_mobile {
namespace framework {
using framework::Variable;
using framework::Variable;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
@@ -390,15 +403,92 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
framework::Tensor tensor(input, framework::make_ddim(dims));
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
}
return result_vector;
} else {
DLOG << "return empty vector";
return {};
}
return result_vector;
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
string var_name) {
framework::Variable *g_feed_value = program_.scope->Var(var_name);
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
InjectVariable(t, "feed");
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
PADDLE_MOBILE_ENFORCE(id < ops.size(), "Index out of range");
auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id];
auto output_map = last_op->Outputs();
std::vector<std::string> out_keys = last_op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
end = end < 0 ? static_cast<int>(ops.size()) : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong");
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
#endif
for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
DLOG << "Running op: " << i << " " << ops[i]->Type();
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
Predict_From_To(start);
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
Predict_From_To(0, end);
}
#endif
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
@@ -470,8 +560,232 @@ void Executor<Dtype, P>::Predict_To(int end) {
}
#endif
#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
float *tensorInput, char **data) {}
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
const framework::VarDesc var_desc, float *tensorInput, char **data) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = *lod_level_ptr;
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
}
(*data) += (sizeof(char) * size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
// int type_size = 0;
// switch (desc.DataType()) {
// case framework::VARTYPE_TYPE_FP16:
// type_size = 2;
// break;
// case framework::VARTYPE_TYPE_FP32:
// type_size = 4;
// memory = tensor->mutable_data<float>();
// break;
// case framework::VARTYPE_TYPE_FP64:
// type_size = 8;
// break;
// case framework::VARTYPE_TYPE_INT32:
// memory = tensor->mutable_data<int32_t>();
// type_size = 4;
// break;
// case framework::VARTYPE_TYPE_INT64:
// type_size = 8;
// break;
// case framework::VARTYPE_TYPE_BOOL:
// type_size = 1;
// break;
// default:
// break;
// }
int type_size = 4;
memory = tensorInput;
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
}
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
char *origin_data =
ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
int numel = 1;
for (auto l : desc.Dims()) {
numel *= l;
}
DLOG << var_desc->Name();
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &data);
framework::DDim ddim = framework::make_ddim(desc.Dims());
// has not init
cl_image->SetTensorData(tensorInput, ddim);
delete origin_data;
paddle_mobile::memory::Free(tensorInput);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
}
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
char *origin_data;
if (program_.combined_params_buf && program_.combined_params_len) {
LOG(kLOG_INFO) << "use outter memory";
origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
} else {
LOG(kLOG_INFO) << " begin init combine memory";
origin_data = ReadFileToBuff(program_.para_path);
}
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
float *data = reinterpret_cast<float *>(origin_data);
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
int numel = 1;
for (int i = 0; i < ddim.size(); i++) {
numel = numel * ddim[i];
}
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &origin_data);
// not initialized yet
cl_image->SetTensorData(tensorInput, ddim);
paddle_mobile::memory::Free(tensorInput);
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
delete origin_data;
LOG(kLOG_INFO) << " end init combine memory ";
}
#endif
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_CL, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
} // namespace framework
} // namespace paddle_mobile
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
......@@ -79,7 +80,10 @@ class Executor {
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
#ifdef PADDLE_MOBILE_CL
void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
char **data);
#endif
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
......@@ -97,4 +101,5 @@ class Executor {
bool loddable_ = false;
};
} // namespace framework
} // namespace paddle_mobile
......@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/loader.h"
#include "framework/loader.h"
#include "framework/lod_tensor.h"
#include "framework/program/program-optimize/program_optimize.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
namespace paddle_mobile {
using framework::Variable;
namespace framework {
/**
* create and resize tensors according to originProgramDesc and scope in loadParams
......@@ -26,23 +29,24 @@ using framework::Variable;
* @param originProgramDesc
* @param scope
*/
void InitMemoryFromProgram(
std::shared_ptr<framework::ProgramDesc> &originProgramDesc, // NOLINT
std::shared_ptr<framework::Scope> &scope) { // NOLINT
template <typename Dtype, Precision P>
void Loader<Dtype, P>::InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope) {
for (const auto &block : originProgramDesc.get()->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = scope.get()->Var(var_desc->Name());
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable()) {
auto dim = var_desc->Tensor_desc().Dims();
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
dim[0] = 1;
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim(dim));
}
} else {
// TODO(codeWorm): some.
......@@ -50,6 +54,36 @@ void InitMemoryFromProgram(
}
}
}
#ifdef PADDLE_MOBILE_CL
template <>
void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope) {
for (const auto &block : originProgramDesc.get()->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = scope.get()->Var(var_desc->Name());
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable()) {
auto dim = var_desc->Tensor_desc().Dims();
// auto tensor = var->GetMutable<LoDTensor>();
auto cl_image = var->GetMutable<framework::CLImage>();
cl_image->Resize(make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
dim[0] = 1;
auto cl_image = var->GetMutable<framework::CLImage>();
cl_image->Resize(make_ddim(dim));
}
} else {
// TODO(codeWorm): some.
}
}
}
}
#endif
/**
* perform fusion and print some info
* @tparam Dtype
......@@ -61,19 +95,18 @@ void InitMemoryFromProgram(
*/
template <typename Dtype, Precision P>
void FusionAndPrintInfos(
bool optimize, bool can_add_split,
framework::Program<Dtype, P> &program, // NOLINT
const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) {
bool optimize, bool can_add_split, Program<Dtype, P> *program,
const std::shared_ptr<ProgramDesc> &originProgramDesc) {
if (optimize) {
framework::ProgramOptimize program_optimize;
program.optimizeProgram =
ProgramOptimize program_optimize;
program->optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split);
if (!program.optimizeProgram) {
program.optimizeProgram = originProgramDesc;
if (!program->optimizeProgram) {
program->optimizeProgram = originProgramDesc;
}
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
program->optimizeProgram->Description("optimize: ");
} else {
originProgramDesc->Description("program: ");
}
......@@ -102,9 +135,10 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
bool optimize,
bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
......@@ -112,9 +146,10 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path,
bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize, quantification);
program.para_path = para_path;
......@@ -124,7 +159,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
......@@ -141,29 +176,29 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
//
DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
//
auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
framework::Program<Dtype, P> program;
Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
program.combined_params_len = 0;
program.combined_params_buf = nullptr;
auto scope = std::make_shared<framework::Scope>();
auto scope = std::make_shared<Scope>();
program.scope = scope;
// use originProgramDesc and scope to init tensors
InitMemoryFromProgram(originProgramDesc, scope);
// perform fusion and print infos
FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
const uint8_t *combined_params_buf, bool optimize, bool quantification) {
uint8_t *combined_params_buf, bool optimize, bool quantification) {
bool can_add_split = false;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
......@@ -177,26 +212,31 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
//
auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
framework::Program<Dtype, P> program;
Program<Dtype, P> program;
program.combined = true;
program.originProgram = originProgramDesc;
program.quantification = quantification;
program.combined_params_len = combined_params_len;
program.combined_params_buf = combined_params_buf;
auto scope = std::make_shared<framework::Scope>();
auto scope = std::make_shared<Scope>();
program.scope = scope;
InitMemoryFromProgram(originProgramDesc, scope);
FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
nullptr);
return program;
}
template class Loader<CPU, Precision::FP32>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<GPU_MALI, Precision::FP32>;
template class Loader<GPU_CL, Precision::FP32>;
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "common/types.h"
#include "framework/program/program.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
* @b load a fluid model stored in the separate-files format
* @b (a __model__ file plus one file per persistable parameter)
* */
const Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
* @b load a fluid model stored in the combined format
* @b (one model file plus one combined params file)
* */
const Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
bool optimize = false,
bool quantification = false);
private:
const Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
void InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope);
};
} // namespace framework
} // namespace paddle_mobile
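// Usage sketch for the two load modes declared above (standalone, not part of
// loader.h). The paths are placeholders and only the signatures from this
// header are assumed: Load(dirname, ...) reads dirname/__model__ plus one file
// per parameter, Load(model_path, para_path, ...) reads a combined params file.
#include "framework/loader.h"

void LoaderUsageSketch() {
  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  // separate-files format
  auto separate = loader.Load("/path/to/model_dir", /*optimize=*/true);
  // combined format
  auto combined = loader.Load("/path/to/model", "/path/to/params",
                              /*optimize=*/true, /*quantification=*/false);
  (void)separate;
  (void)combined;
}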
......@@ -14,8 +14,10 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include "common/log.h"
#include "common/type_define.h"
#include "framework/op_info.h"
......@@ -120,5 +122,8 @@ class OpRegistry {
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define REGISTER_OPERATOR_CL(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL);
} // namespace framework
} // namespace paddle_mobile
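// Registration sketch: the new REGISTER_OPERATOR_CL macro mirrors the existing
// CPU/MALI/FPGA variants, so an operator opts into the OpenCL backend with one
// guarded line (this exact form appears for batch_norm later in this change;
// the ops alias is assumed to be paddle_mobile::operators as in the op files).
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp);
#endif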
......@@ -56,37 +56,69 @@ template <typename Dtype>
void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
void OperatorBase<Dtype>::Run() {
DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
for (const auto key : input_keys) {
auto var_vec_in = inputs_.at(key);
for (int i = 0; i < var_vec_in.size(); ++i) {
auto vari = scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
}
}
}
for (const auto key : GetOutKeys()) {
auto var_vec_out = outputs_.at(key);
for (int i = 0; i < var_vec_out.size(); ++i) {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
}
}
}
#endif
DLOG << " ----- End run impl --- " << type_ << " ----- ";
//#ifdef PADDLE_MOBILE_DEBUG
// DLOG << "-------------" << type_ << "----------------------------";
// vector<string> input_keys = GetInputKeys();
// for (const auto key : input_keys) {
// auto var_vec_in = inputs_.at(key);
// for (int i = 0; i < var_vec_in.size(); ++i) {
// auto vari = scope_->FindVar(var_vec_in[i]);
// if (vari->IsInitialized()) {
//#ifdef PADDLE_MOBILE_CL
// if (type_ == "feed") {
// Tensor *tensor = vari->template
// GetMutable<framework::LoDTensor>(); if (tensor) DLOG << type_ << "
// input- " << key << "=" << *tensor;
// } else {
// CLImage *cl_image = vari->template
// GetMutable<framework::CLImage>(); if (cl_image) {
// DLOG << type_ << " input- " << key << "=" << *cl_image;
// }
// }
//
//#else
// Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
// if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
//#endif
// }
// }
// }
// for (const auto key : GetOutKeys()) {
// auto var_vec_out = outputs_.at(key);
// for (int i = 0; i < var_vec_out.size(); ++i) {
// auto vari = scope_->FindVar(var_vec_out[i]);
// if (vari->IsInitialized()) {
//#ifdef PADDLE_MOBILE_CL
// if (type_ == "fetch") {
// Tensor *tensor = vari->template
// GetMutable<framework::LoDTensor>(); if (tensor) {
// DLOG << type_ << " output- " << key << "=" << *tensor;
// }
// } else {
// CLImage *cl_image = vari->template
// GetMutable<framework::CLImage>(); if (cl_image) {
// DLOG << type_ << " output- " << key << "=" << *cl_image;
// }
// }
//
//#else
// Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
// if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
//#endif
// }
// }
// }
//#endif
}
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
template class OperatorBase<GPU_CL>;
} // namespace framework
} // namespace paddle_mobile
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "common/enforce.h"
......@@ -31,7 +32,10 @@ limitations under the License. */
#include "framework/scope.h"
#include "framework/tensor.h"
#include "framework/variable.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_helper.h"
#include "framework/cl/cl_scope.h"
#endif
namespace paddle_mobile {
namespace framework {
using std::string;
......@@ -59,10 +63,10 @@ class OperatorBase {
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope);
virtual ~OperatorBase() {}
void Run() const;
void Run();
std::vector<string> GetOutKeys() const;
std::vector<string> GetInputKeys() const;
virtual void RunImpl() const = 0;
virtual void RunImpl() = 0;
virtual void Init() = 0;
/*
......@@ -112,9 +116,13 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, *scope) {}
param_(inputs, outputs, attrs, *scope) {
#ifdef PADDLE_MOBILE_CL
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
virtual void RunImpl() const { this->kernel_.Compute(this->param_); }
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......@@ -123,6 +131,7 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str());
}
......@@ -138,22 +147,35 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
template <typename Dtype, typename P>
class OpKernelBase {
public:
/*
* @b every kernel must implement the Compute method
* @p para a struct bundling the parameters the kernel needs at run time;
*    all of these structs live in: paddle-mobile/src/operators/op_param.h
* */
#ifdef PADDLE_MOBILE_MALI_GPU
OpKernelBase() = default;
#ifdef PADDLE_MOBILE_CL
virtual void InitCLHelper(CLScope *clScope) {
cl_helper_ = CLHelper(clScope);
}
#endif
/*
* @b every kernel must implement the Compute method
* @p para a struct bundling the parameters the kernel needs at run time;
*    all of these structs live in: paddle-mobile/src/operators/op_param.h
* */
#ifdef PADDLE_McOBILE_MALI_GPU
OpKernelBase() { acl_op_ = nullptr; }
void *GetAclOp() const { return acl_op_; }
void SetAclOp(void *op, void *ob) const {
reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
}
#endif
virtual void Compute(const P &para) const = 0;
virtual void Compute(const P &para) = 0;
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
protected:
#ifdef PADDLE_MOBILE_CL
CLHelper cl_helper_;
#endif
private:
#ifdef PADDLE_MOBILE_MALI_GPU
void *acl_op_;
......
......@@ -18,6 +18,8 @@ limitations under the License. */
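// Sketch of the kernel contract after this change (standalone; ReluKernel and
// ReluParam are placeholder names, not additions made by this patch, and the
// class is assumed to sit in namespace paddle_mobile::operators like the real
// kernels). Every kernel derives from OpKernelBase<DeviceType, ParamType>,
// implements the now non-const Compute, and may override Init for one-time
// setup; on GPU_CL the protected cl_helper_ is already wired to the scope's
// CLScope by OperatorWithKernel's constructor.
template <typename DeviceType, typename T>
class ReluKernel
    : public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
 public:
  bool Init(ReluParam<DeviceType> *param) override {
    // e.g. build/compile the OpenCL kernel through cl_helper_ on GPU_CL
    return true;
  }
  void Compute(const ReluParam<DeviceType> &param) override {
    // per-run work; Compute is no longer const, so kernels may keep state
  }
};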
#include "framework/program/program_desc.h"
#include "framework/scope.h"
#include <string>
namespace paddle_mobile {
namespace framework {
......@@ -32,7 +34,7 @@ class Program {
bool combined = false;
bool quantification = false;
size_t combined_params_len;
const uint8_t *combined_params_buf;
uint8_t *combined_params_buf;
};
} // namespace framework
......
......@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include "variable.h"
#include <vector>
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_scope.h"
#endif
#include "framework/variable.h"
namespace paddle_mobile {
namespace framework {
......@@ -33,6 +39,10 @@ class Scope {
delete kid;
}
kids_.clear();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
#endif
}
Scope &NewScope() const;
......@@ -72,6 +82,10 @@ class Scope {
Variable *FindVarLocally(const std::string &name) const;
#ifdef PADDLE_MOBILE_CL
CLScope *GetCLScpoe() { return cl_scope_; }
#endif
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const *parent) : parent_(parent) {}
......@@ -79,6 +93,10 @@ class Scope {
mutable std::unordered_map<std::string, Variable *> vars_;
mutable std::list<Scope *> kids_;
Scope const *parent_{nullptr};
#ifdef PADDLE_MOBILE_CL
CLScope *cl_scope_ = new CLScope();
#endif
};
} // namespace framework
} // namespace paddle_mobile
......@@ -24,65 +24,24 @@ limitations under the License. */
#include <vector>
#include "common/enforce.h"
#include "common/types.h"
#include "framework/data_layout.h"
#include "framework/ddim.h"
#include "framework/tensor_base.h"
#include "memory/t_malloc.h"
namespace paddle_mobile {
namespace framework {
template <typename... T>
struct SizeOfTypeFunctor;
template <typename T>
struct SizeOfTypeFunctor<T> {
size_t operator()(std::type_index type) const {
if (typeid(T).hash_code() == type.hash_code()) {
return sizeof(T);
} else {
return 0UL;
}
}
};
template <>
struct SizeOfTypeFunctor<> {
size_t operator()(std::type_index type) const { return 0UL; }
};
template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
size_t operator()(std::type_index type) const {
SizeOfTypeFunctor<HEAD> head;
size_t head_size = head(type);
if (head_size != 0) {
return head_size;
}
SizeOfTypeFunctor<TAIL...> tail;
return tail(type);
}
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t, bool,
size_t>
functor;
size_t size = functor(type);
PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
class LoDTensor;
class Tensor {
class Tensor : public TensorBase {
public:
Tensor() : offset_(0) {}
Tensor() {}
template <typename T>
Tensor(std::vector<T> input, DDim ddim) : offset_(0) {
Tensor(std::vector<T> input, DDim ddim) {
PADDLE_MOBILE_ENFORCE(
input.size() == framework::product(ddim),
"input vector'length should be equal to tensor's length");
auto input_ptr = mutable_data<T>(ddim);
for (int i = 0; i < input.size(); ++i) {
input_ptr[i] = input[i];
......@@ -95,46 +54,6 @@ class Tensor {
this->offset_ = inTensor.offset_;
}
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T *data() {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
/*! Return a pointer to constant memory block. */
template <typename T>
inline const T *data() const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s ,requested:%s",
this->holder_->type().name(), typeid(T).name());
return reinterpret_cast<const T *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
inline bool IsInitialized() const { return holder_ != nullptr; }
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data() {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T *>(mutable_data(typeid(T)));
}
#ifdef PADDLE_MOBILE_DEBUG
template <typename T>
inline void dump(std::string filename) const {
......@@ -151,6 +70,21 @@ class Tensor {
}
#endif
/*! Resize the dimensions of the memory block. */
inline Tensor &Resize(const DDim &dims) {
dims_ = dims;
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareDataWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
*this = src;
}
return *this;
}
inline void *mutable_data(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
......@@ -165,6 +99,16 @@ class Tensor {
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data() {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T *>(mutable_data(typeid(T)));
}
/**
* @brief Return a pointer to mutable memory block.
*
......@@ -180,27 +124,6 @@ class Tensor {
return mutable_data<T>();
}
/*! Return the dimensions of the memory block. */
inline const DDim &dims() const { return dims_; }
/*! Return the numel of the memory block. */
inline int64_t numel() const { return product(dims_); }
/*! Resize the dimensions of the memory block. */
inline Tensor &Resize(const DDim &dims) {
dims_ = dims;
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareDataWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
*this = src;
}
return *this;
}
/**
* @brief Return a sub-tensor of the given tensor.
*
......@@ -234,44 +157,35 @@ class Tensor {
}
}
std::type_index type() const {
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T *data() {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor not initialized yet when Tensor::type() is called.")
return holder_->type();
}
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
// memory size returns the holding memory size in byte.
size_t memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void check_memory_size() const {
/*! Return a pointer to constant memory block. */
template <typename T>
inline const T *data() const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(),
"Tensor's dims_ is out of bound. ");
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s ,requested:%s",
this->holder_->type().name(), typeid(T).name());
return reinterpret_cast<const T *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a
* template
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() = default;
virtual void *ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual void set_type(std::type_index type) = 0;
};
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t size, std::type_index type)
: ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
......@@ -299,27 +213,6 @@ class Tensor {
std::type_index type_;
};
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
/**
* @brief points to elements dimensions.
*
* @note dims_ do not indicate the memory block size.
*/
DDim dims_;
/**
* @brief A PlaceHolder may be shared by more than one tensor.
*
* @note Some of them may be slices of the others. So the offset_
* is introduced here to indicate the byte offset between
* PlaceHolder::ptr_ and where the tensor data really
* begins.
*/
size_t offset_;
#ifdef PADDLE_MOBILE_FPGA
public: // NOLINT
inline void reset_data_ptr(void *p) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <type_traits>
#include <typeindex>
#include "common/enforce.h"
#include "common/types.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace framework {
template <typename... T>
struct SizeOfTypeFunctor;
template <typename T>
struct SizeOfTypeFunctor<T> {
size_t operator()(std::type_index type) const {
if (typeid(T).hash_code() == type.hash_code()) {
return sizeof(T);
} else {
return 0UL;
}
}
};
template <>
struct SizeOfTypeFunctor<> {
size_t operator()(std::type_index type) const { return 0UL; }
};
template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
size_t operator()(std::type_index type) const {
SizeOfTypeFunctor<HEAD> head;
size_t head_size = head(type);
if (head_size != 0) {
return head_size;
}
SizeOfTypeFunctor<TAIL...> tail;
return tail(type);
}
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t, bool,
size_t>
functor;
size_t size = functor(type);
PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
class TensorBase {
public:
virtual inline TensorBase &Resize(const DDim &dims) = 0;
inline bool IsInitialized() const { return holder_ != nullptr; }
/*! Return the dimensions of the memory block. */
inline const DDim &dims() const { return dims_; }
/*! Return the numel of the memory block. */
inline int64_t numel() const { return product(dims_); }
std::type_index type() const {
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor not initialized yet when Tensor::type() is called.")
return holder_->type();
}
// memory size returns the holding memory size in byte.
size_t memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;
}
inline void check_memory_size() const {
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(),
"Tensor's dims_ is out of bound. ");
}
protected:
/**
* @note Placeholder hides type T, so it doesn't appear as a
* template
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() = default;
virtual void *ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual void set_type(std::type_index type) = 0;
};
/**
* @brief points to elements dimensions.
*
* @note dims_ do not indicate the memory block size.
*/
DDim dims_;
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
/**
* @brief A PlaceHolder may be shared by more than one tensor.
*
* @note Some of them may be slices of the others. So the offset_
* is introduced here to indicate the byte offset between
* PlaceHolder::ptr_ and where the tensor data really
* begins.
*/
size_t offset_ = 0;
};
} // namespace framework
} // namespace paddle_mobile
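// Small usage sketch of the Tensor API after the TensorBase split (standalone;
// assumes the usual framework headers and the initializer-list overload of
// make_ddim). Only members shown above are exercised.
#include "framework/ddim.h"
#include "framework/tensor.h"

void TensorUsageSketch() {
  using namespace paddle_mobile::framework;  // NOLINT
  Tensor t;
  t.Resize(make_ddim({1, 3, 224, 224}));  // dims_ now lives in TensorBase
  float *ptr = t.mutable_data<float>();   // allocates numel() * sizeof(float)
  ptr[0] = 1.0f;
  // SizeOfType maps a std::type_index to its byte size, e.g. 4 for float
  size_t bytes = t.numel() * SizeOfType(t.type());
  (void)bytes;
}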
......@@ -126,6 +126,8 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_CL) {
x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
} else {
LOG(kLOG_ERROR) << "unsupport device type!";
return nullptr;
......
......@@ -44,7 +44,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
......@@ -121,7 +121,7 @@ struct PaddleModelMemoryPack {
struct PaddleMobileConfig : public PaddlePredictor::Config {
enum Precision { FP32 = 0 };
enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 };
enum Precision precision;
enum Device device;
......
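// Sketch of selecting the new OpenCL backend through the high-level C++ API
// (standalone; the namespace qualification and the omitted setup such as model
// paths are assumptions, only the fields visible in this hunk are set).
void CreateClPredictorSketch() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL;  // new in this change
  auto predictor = paddle_mobile::CreatePaddlePredictor<
      paddle_mobile::PaddleMobileConfig,
      paddle_mobile::PaddleEngineKind::kPaddleMobile>(config);
  // CreatePaddlePredictor returns nullptr for an unsupported device type
  (void)predictor;
}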
......@@ -28,13 +28,13 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
bool quantification, int batch_size,
bool loddable) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
loader_->Load(dirname, optimize, quantification), batch_size, optimize,
loddable);
} else {
......@@ -50,13 +50,13 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
bool quantification, int batch_size,
bool loddable) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize, loddable);
} else {
......@@ -67,21 +67,22 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(
size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
const uint8_t *combined_params_buf) {
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf) {
int batch_size = 1;
bool optimise = true;
bool quantification = false;
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimise,
quantification),
......@@ -161,4 +162,6 @@ template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
template class PaddleMobile<GPU_CL, Precision::FP32>;
} // namespace paddle_mobile
......@@ -22,10 +22,10 @@ limitations under the License. */
#endif // _OPENMP
#include "common/types.h"
#include "framework/executor.h"
#include "framework/load_ops.h"
#include "framework/loader.h"
#include "framework/tensor.h"
#include "io/executor.h"
#include "io/loader.h"
namespace paddle_mobile {
......@@ -52,7 +52,7 @@ class PaddleMobile {
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len,
const uint8_t *combined_params_buf);
uint8_t *combined_params_buf);
void SetThreadNum(int num);
void Clear();
......@@ -69,8 +69,8 @@ class PaddleMobile {
#endif
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
std::shared_ptr<framework::Loader<Dtype, P>> loader_;
std::shared_ptr<framework::Executor<Dtype, P>> executor_;
};
} // namespace paddle_mobile
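// End-to-end sketch: loading a separate-files model on the new GPU_CL backend
// through the PaddleMobile facade (standalone; the include path and the model
// directory are placeholders, argument order follows the definitions above).
#include "io/paddle_mobile.h"

void LoadOnGpuClSketch() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL, paddle_mobile::Precision::FP32> pm;
  // Load wires up framework::Loader and framework::Executor for GPU_CL
  bool ok = pm.Load("/data/local/tmp/mobilenet", /*optimize=*/true,
                    /*quantification=*/false, /*batch_size=*/1,
                    /*loddable=*/false);
  if (!ok) {
    // loading failed; the loader/executor logs explain why
  }
}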
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef BATCHNORM_OP
#include "batchnorm_op.h"
#include "operators/batchnorm_op.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
......@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#ifdef PADDLE_MOBILE_FPGA
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp);
#endif
#endif
......@@ -40,10 +40,6 @@ class BilinearOp : public framework::OperatorWithKernel<
DeviceType, BilinearInterpParam<DeviceType>,
operators::BilinearInterpKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, BilinearInterpParam<DeviceType>,
operators::BilinearInterpKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
......
......@@ -39,10 +39,6 @@ class BoxCoderOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, BoxCoderParam<DeviceType>,
operators::BoxCoderKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, BoxCoderParam<DeviceType>,
operators::BoxCoderKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -34,10 +34,6 @@ class ConcatOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, ConcatParam<DeviceType>,
operators::ConcatKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ConcatParam<DeviceType>,
operators::ConcatKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -62,4 +62,8 @@ REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(conv2d, ops::ConvOp);
#endif
#endif
......@@ -34,10 +34,6 @@ class ConvOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
private:
......
......@@ -37,10 +37,6 @@ class CrfOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>,
operators::CrfKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, CrfParam<DeviceType>,
operators::CrfKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
......
......@@ -36,10 +36,6 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
DeviceType, ConvParam<DeviceType>,
operators::DepthwiseConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ConvParam<DeviceType>,
operators::DepthwiseConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
private:
......
......@@ -38,10 +38,6 @@ class DropoutOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
operators::DropoutKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// using framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
// operators::DropoutKernel<DeviceType,
// T>>;
void InferShape() const override;
protected:
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#include "elementwise_add_op.h"
#include "operators/elementwise_add_op.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,4 +36,8 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
#endif
#endif
......@@ -37,10 +37,6 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam<DeviceType>,
operators::ElementwiseAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam<DeviceType>,
operators::ElementwiseAddKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -14,6 +14,19 @@ limitations under the License. */
#include "operators/feed_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void FeedOp<DeviceType, T>::InferShape() const {
auto out_dims = this->param_.Out()->dims();
out_dims[0] = this->param_.BatchSize();
this->param_.Out()->Resize(out_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
......@@ -25,3 +38,6 @@ REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(feed, ops::FeedOp);
#endif
......@@ -16,68 +16,29 @@ limitations under the License. */
#include <string>
#include "framework/operator.h"
#include "operators/kernel/feed_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FeedOp : public framework::OperatorBase<DeviceType> {
class FeedOp
: public framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>> {
public:
FeedOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, scope.get()) {}
void InferShape() const {
auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize();
param_.Out()->Resize(out_dims);
}
#ifdef PADDLE_MOBILE_FPGA
void Init() {
Tensor *output = param_.Out();
fpga::format_fp16_ofm(output);
}
void RunImpl() const {
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX()); // NOLINT
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param_.Out();
auto output_ptr = output->data<float>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = (void *)input_ptr; // NOLINT
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
}
#else
void Init() {}
void RunImpl() const {
param_.Out()->ShareDataWith(*param_.InputX());
param_.Out()->set_lod(param_.InputX()->lod());
}
#endif
: framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
protected:
FeedParam<DeviceType> param_;
};
} // namespace operators
......
......@@ -13,6 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/fetch_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void FetchOp<DeviceType, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
......@@ -24,3 +35,6 @@ REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fetch, ops::FetchOp);
#endif
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include "framework/operator.h"
#include "operators/kernel/fetch_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -23,25 +24,20 @@ namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FetchOp : public framework::OperatorBase<DeviceType> {
class FetchOp
: public framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
FetchKernel<DeviceType, T>> {
public:
FetchOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
: framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
FetchKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void Init() {}
void InferShape() const {
auto x_dims = param_.InputX()->dims();
param_.Out()->Resize(x_dims);
}
void InferShape() const override;
protected:
FetchParam<DeviceType> param_;
};
} // namespace operators
......
......@@ -20,9 +20,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp);
#endif
......
......@@ -37,7 +37,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const {
void RunImpl() {
auto data_type =
static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
param_.DataDtype());
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "operators/kernel/flatten_kernel.h"
......@@ -53,10 +54,6 @@ class FlattenOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, FlattenParam<DeviceType>,
operators::FlattenKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FlattenParam<DeviceType>,
operators::FlattenKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
......@@ -67,10 +68,6 @@ class FusionConvAddAddPReluOp
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -58,5 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#endif
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -66,10 +66,6 @@ class FusionConvAddBNReluOp
DeviceType, FusionConvAddBNReluParam<DeviceType>,
operators::ConvAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam<DeviceType>,
operators::ConvAddBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -58,4 +58,8 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
#endif
#endif
......@@ -19,8 +19,8 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -56,10 +56,6 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
FusionConvAddParam<DeviceType>,
operators::ConvAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddParam<DeviceType>,
operators::ConvAddKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -39,10 +39,7 @@ class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}
},
{G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
......@@ -63,9 +60,6 @@ class FusionConvAddPReluOp
operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddPReluParam<DeviceType>,
operators::ConvAddPReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#endif
......@@ -56,9 +56,6 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -17,11 +17,12 @@ limitations under the License. */
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_bn_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -71,10 +72,6 @@ class FusionConvBNAddReluOp
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -63,10 +63,6 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam<DeviceType>,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam<DeviceType>,
operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/dwconv_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -65,9 +65,6 @@ class FusionDWConvBNReluOp
operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam<DeviceType>,
operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -56,10 +56,6 @@ class FusionFcOp : public framework::OperatorWithKernel<
operators::FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
......
......@@ -56,9 +56,6 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
operators::FusionFcReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionFcReluParam<DeviceType>,
operators::FusionFcReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
......
......@@ -37,10 +37,6 @@ class GruOp : public framework::OperatorWithKernel<
: framework::OperatorWithKernel<DeviceType, GruParam<DeviceType>,
operators::GruKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, GruParam<DeviceType>,
operators::GruKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
......
......@@ -16,15 +16,14 @@ limitations under the License. */
#pragma once
#include <operators/op_param.h>
#include <string>
#include "framework/operator.h"
#include "operators/kernel/im2sequence_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class Im2SequenceOp : public framework::OperatorWithKernel<
DeviceType, Im2SequenceParam<DeviceType>,
......@@ -39,9 +38,6 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
operators::Im2SequenceKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
// using framework::OperatorWithKernel<
// DeviceType, Im2SequenceParam<DeviceType>,
// operators::Im2SequenceKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
private:
......
......@@ -26,8 +26,7 @@ bool BatchNormKernel<CPU, float>::Init(BatchNormParam<CPU> *param) {
}
template <>
void BatchNormKernel<CPU, float>::Compute(
const BatchNormParam<CPU> &param) const {
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam<CPU> &param) {
BatchnormCompute<float>(param);
}
......
......@@ -27,7 +27,7 @@ bool BilinearInterpKernel<CPU, float>::Init(BilinearInterpParam<CPU> *param) {
template <>
void BilinearInterpKernel<CPU, float>::Compute(
const BilinearInterpParam<CPU> &param) const {
const BilinearInterpParam<CPU> &param) {
BilinearInterpCompute<float>(param);
}
......
......@@ -26,8 +26,7 @@ bool BoxCoderKernel<CPU, float>::Init(BoxCoderParam<CPU> *param) {
}
template <>
void BoxCoderKernel<CPU, float>::Compute(
const BoxCoderParam<CPU> &param) const {
void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam<CPU> &param) {
BoxCoderCompute<float>(param);
}
......
......@@ -26,7 +26,7 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) {
}
template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) const {
void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) {
ConcatCompute<float>(param);
param.Out()->set_lod(param.Inputs()[0]->lod());
}
......
......@@ -28,7 +28,7 @@ bool ConvAddAddPReluKernel<CPU, float>::Init(
template <>
void ConvAddAddPReluKernel<CPU, float>::Compute(
const FusionConvAddAddPReluParam<CPU> &param) const {
const FusionConvAddAddPReluParam<CPU> &param) {
ConvAddAddPReluCompute<float>(param);
}
template class ConvAddAddPReluKernel<CPU, float>;
......
......@@ -55,7 +55,7 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
template <>
void ConvAddBNReluKernel<CPU, float>::Compute(
const FusionConvAddBNReluParam<CPU> &param) const {
const FusionConvAddBNReluParam<CPU> &param) {
ConvAddBNReluCompute<float>(param);
}
template class ConvAddBNReluKernel<CPU, float>;
......
......@@ -25,8 +25,7 @@ bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam<CPU> *param) {
}
template <>
void ConvAddKernel<CPU, float>::Compute(
const FusionConvAddParam<CPU> &param) const {
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
ConvAddCompute<float>(param);
}
......
......@@ -27,7 +27,7 @@ bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
template <>
void ConvAddPReluKernel<CPU, float>::Compute(
const FusionConvAddPReluParam<CPU> &param) const {
const FusionConvAddPReluParam<CPU> &param) {
ConvAddPReluCompute<float>(param);
}
template class ConvAddPReluKernel<CPU, float>;
......
......@@ -27,7 +27,7 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) const {
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<float>(param);
}
template class ConvAddReluKernel<CPU, float>;
......
......@@ -55,7 +55,7 @@ bool ConvBNAddReluKernel<CPU, float>::Init(
template <>
void ConvBNAddReluKernel<CPU, float>::Compute(
const FusionConvBNAddReluParam<CPU> &param) const {
const FusionConvBNAddReluParam<CPU> &param) {
ConvBNAddReluCompute<float>(param);
}
template class ConvBNAddReluKernel<CPU, float>;
......
......@@ -57,7 +57,7 @@ bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam<CPU> *param) {
template <>
void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam<CPU> &param) const {
const FusionConvBNReluParam<CPU> &param) {
ConvBNReluCompute<float>(param);
}
template class ConvBNReluKernel<CPU, float>;
......
......@@ -26,7 +26,7 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
}
template <>
void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) const {
void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
ConvCompute<float>(param);
}
......
......@@ -27,7 +27,7 @@ bool ConvTransposeKernel<CPU, float>::Init(ConvTransposeParam<CPU> *param) {
template <>
void ConvTransposeKernel<CPU, float>::Compute(
const ConvTransposeParam<CPU> &param) const {
const ConvTransposeParam<CPU> &param) {
ConvTransposeCompute<float>(param);
}
......
......@@ -27,7 +27,7 @@ bool CrfKernel<CPU, float>::Init(CrfParam<CPU> *param) {
}
template <>
void CrfKernel<CPU, float>::Compute(const CrfParam<CPU> &param) const {
void CrfKernel<CPU, float>::Compute(const CrfParam<CPU> &param) {
CrfCompute<float>(param);
}
......
......@@ -26,8 +26,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
}
template <>
void DepthwiseConvKernel<CPU, float>::Compute(
const ConvParam<CPU> &param) const {
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
DepthwiseConvCompute<float>(param);
}
......
......@@ -29,8 +29,7 @@ bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
}
template <>
void DequantizeKernel<CPU, float>::Compute(
const DequantizeParam<CPU> &param) const {
void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
const Tensor *input = param.input_;
Tensor *output = param.out_;
float activation_scale = param.activation_scale_->data<float>()[0];
......
......@@ -27,7 +27,7 @@ bool DropoutKernel<CPU, float>::Init(DropoutParam<CPU> *para) {
template <typename T>
struct DropoutFunctor {
DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
inline T operator()(T in) const { return (1 - dropout_pro_) * in; }
private:
......@@ -35,7 +35,7 @@ struct DropoutFunctor {
};
template <>
void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) const {
void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
......
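An aside on the `explicit` keyword added to DropoutFunctor's single-argument constructor above: it blocks accidental implicit conversions from a bare float into the functor type. The following standalone illustration uses a hypothetical functor name (not a type from this repository) to show what the keyword rejects.

#include <iostream>

// Hypothetical functor mirroring the single-argument-constructor pattern.
struct Scaler {
  explicit Scaler(float k) : k_(k) {}  // 'explicit' as in the hunk above
  float operator()(float x) const { return k_ * x; }

 private:
  float k_;
};

float apply(const Scaler &s, float x) { return s(x); }

int main() {
  // apply(0.5f, 2.0f);  // rejected: no implicit float -> Scaler conversion
  std::cout << apply(Scaler(0.5f), 2.0f) << std::endl;  // prints 1
  return 0;
}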
......@@ -54,7 +54,7 @@ bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam<CPU> *param) {
template <>
void DWConvBNReluKernel<CPU, float>::Compute(
const FusionDWConvBNReluParam<CPU> &param) const {
const FusionDWConvBNReluParam<CPU> &param) {
DWConvBNReluCompute<float>(param);
}
template class DWConvBNReluKernel<CPU, float>;
......
......@@ -27,7 +27,7 @@ bool ElementwiseAddKernel<CPU, float>::Init(ElementwiseAddParam<CPU> *param) {
template <>
void ElementwiseAddKernel<CPU, float>::Compute(
const ElementwiseAddParam<CPU> &param) const {
const ElementwiseAddParam<CPU> &param) {
ElementwiseAddCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
......
......@@ -27,7 +27,7 @@ bool ElementwiseMulKernel<CPU, float>::Init(ElementwiseMulParam<CPU> *param) {
template <>
void ElementwiseMulKernel<CPU, float>::Compute(
const ElementwiseMulParam<CPU> &param) const {
const ElementwiseMulParam<CPU> &param) {
ElementwiseMulCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
......
......@@ -27,7 +27,7 @@ bool ElementwiseSubKernel<CPU, float>::Init(ElementwiseSubParam<CPU> *param) {
template <>
void ElementwiseSubKernel<CPU, float>::Compute(
const ElementwiseSubParam<CPU> &param) const {
const ElementwiseSubParam<CPU> &param) {
ElementwiseSubCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) {
return true;
}
template <>
void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
param.Out()->set_lod(param.InputX()->lod());
}
template class FeedKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/fetch_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<CPU, float>::Init(FetchParam<CPU> *param) {
return true;
}
template <>
void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
}
template class FetchKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
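The feed and fetch kernels above follow the same two-step shape used by every CPU kernel touched in this merge: a class template specialized for <CPU, float>, with Init doing one-time preparation and a non-const Compute doing the per-run work through pointers held by the parameter object. The self-contained sketch below mirrors that shape with a hypothetical scale op; the device tag, parameter layout, and op name are placeholders, not identifiers from this codebase.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical device tag standing in for the framework's CPU type.
struct CPUTag {};

// Hypothetical parameter object: like the real *Param<DeviceType> classes,
// it only holds pointers to the data the kernel reads and writes.
template <typename Device>
struct ScaleParam {
  const std::vector<float> *input = nullptr;
  std::vector<float> *output = nullptr;
  float scale = 1.0f;
};

template <typename Device, typename T>
class ScaleKernel {
 public:
  bool Init(ScaleParam<Device> *param);
  void Compute(const ScaleParam<Device> &param);  // non-const, as in this diff
};

// Explicit specialization for <CPUTag, float>, matching the
// "template <> bool XKernel<CPU, float>::Init(...)" pattern above.
template <>
bool ScaleKernel<CPUTag, float>::Init(ScaleParam<CPUTag> *param) {
  // Real kernels fold weights or pick code paths here; nothing to do for scale.
  return param != nullptr && param->input != nullptr && param->output != nullptr;
}

template <>
void ScaleKernel<CPUTag, float>::Compute(const ScaleParam<CPUTag> &param) {
  const std::vector<float> &in = *param.input;
  std::vector<float> &out = *param.output;  // mutation goes through the pointer
  out.resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    out[i] = in[i] * param.scale;
  }
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f};
  std::vector<float> out;
  ScaleParam<CPUTag> param;
  param.input = &in;
  param.output = &out;
  param.scale = 2.f;

  ScaleKernel<CPUTag, float> kernel;
  if (kernel.Init(&param)) {
    kernel.Compute(param);
  }
  for (float v : out) {
    std::cout << v << " ";  // prints: 2 4 6
  }
  std::cout << std::endl;
  return 0;
}

The op registration step the real files end with (e.g. "template class FeedKernel<CPU, float>;" plus the surrounding macro machinery) is omitted from this sketch.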
......@@ -26,7 +26,7 @@ bool FlattenKernel<CPU, float>::Init(FlattenParam<CPU> *param) {
}
template <>
void FlattenKernel<CPU, float>::Compute(const FlattenParam<CPU> &param) const {
void FlattenKernel<CPU, float>::Compute(const FlattenParam<CPU> &param) {
FlattenCompute<float>(param);
}
......
......@@ -26,8 +26,7 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
}
template <>
void FusionFcKernel<CPU, float>::Compute(
const FusionFcParam<CPU> &param) const {
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
FusionFcCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
......
......@@ -26,7 +26,7 @@ bool GruKernel<CPU, float>::Init(GruParam<CPU> *param) {
}
template <>
void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) const {
void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) {
GruCompute<float>(param);
param.OutHidden()->set_lod(param.InputInput()->lod());
// DLOG << "________________" << param.OutHidden()->dims();
......
(The remaining file diffs in this merge request are collapsed and not shown.)