提交 0533e07b 编写于 作者: Z zhangyang

implement data format for FPGA track

上级 123449e8
...@@ -26,8 +26,14 @@ limitations under the License. */ ...@@ -26,8 +26,14 @@ limitations under the License. */
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <vector>
#include "api.h" #include "api.h"
#include "bias_scale.h"
#include "common/enforce.h"
#include "common/types.h"
#include "filter.h"
#include "image.h"
#define FPGA_TEST_MODE #define FPGA_TEST_MODE
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
...@@ -164,5 +170,59 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -164,5 +170,59 @@ int PerformBypass(const struct BypassArgs &args) {
return do_ioctl(IOCTL_CONFIG_BYPASS, &args); return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
} }
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->mutable_data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
}
void format_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
size_t memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
}
void format_filter(framework::Tensor *filter_tensor, int group_num) {
auto dims = filter_tensor->dims();
int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
float max_value = filter::find_max(new_data, num * channel * height * width);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_matrix(framework::Tensor *filter_tensor, int group_num,
int height, int width) {
auto dims = filter_tensor->dims();
PADDLE_MOBILE_ENFORCE(dims[0] % (height * width) == 0,
"Filter number should be divisible by group number");
int num = dims[1], channel = dims[0] / height / width;
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
float max_value = filter::find_max(new_data, num * channel * height * width);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
bias_scale::format_bias_scale_array(bias_scale_array,
element_num_per_division, num);
}
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <cstddef> #include <cstddef>
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include "framework/tensor.h"
// memory management; // memory management;
...@@ -175,6 +176,13 @@ int ComputeFpgaPool(const struct PoolingArgs& args); ...@@ -175,6 +176,13 @@ int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_filter(framework::Tensor* filter_tensor, int group_num);
void format_fc_matrix(framework::Tensor* filter_tensor, int group_num,
int height = 1, int width = 1);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -21,6 +21,7 @@ namespace fpga { ...@@ -21,6 +21,7 @@ namespace fpga {
namespace bias_scale { namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) { void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in; float *ptr_unaligned = *data_in;
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
...@@ -33,8 +34,20 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -33,8 +34,20 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
memset(ptr_aligned, 0, num_element * sizeof(float)); memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) { for (int i = 0; i < div_num; i++) {
memcpy(ptr_aligned + i * num_per_div_after_alignment, ptr_unaligned, if (i == div_num - 1) {
num_per_div_before_alignment * sizeof(float)); copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
} }
fpga_free(ptr_unaligned); fpga_free(ptr_unaligned);
...@@ -52,14 +65,22 @@ void interleave(float **data_in, int num_after_alignment) { ...@@ -52,14 +65,22 @@ void interleave(float **data_in, int num_after_alignment) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float)); 4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4, memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment * sizeof(float) + 4 * i, ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
4 * sizeof(float));
} }
fpga_free(ptr_uninterleaved); fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved; *data_in = ptr_interleaved;
} }
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
}
} // namespace bias_scale } // namespace bias_scale
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -22,6 +22,8 @@ namespace bias_scale { ...@@ -22,6 +22,8 @@ namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num); void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment); void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
} // namespace bias_scale } // namespace bias_scale
} // namespace fpga } // namespace fpga
......
...@@ -19,21 +19,190 @@ namespace paddle_mobile { ...@@ -19,21 +19,190 @@ namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
void convert_to_hwc(float** data_in, int num, int channel, int height, int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
int width) {}
float find_max(float* data_in, int num) { return 0; } int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
}
void quantize(float* data_in, int num) {} int calc_division_number(int num, int group_num, int division_capacity) {
PADDLE_MOBILE_ENFORCE(num % group_num == 0,
"Filter number should be divisible by group number");
int split_num = calc_split_num(num, division_capacity);
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
"Split number or group number should be 1");
return group_num * split_num;
}
void align_element(float** data_in, int num, int chw) {} int calc_num_per_div(int num, int group_num, int division_capacity) {
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
}
void align_num(float** data_in, int num_per_div_before_alignment, int num, void convert_to_hwc(char **data_in, int num, int channel, int height,
int chw) {} int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
void reorder(float** data_in, int num_after_alignment, int chw) {} *data_in = data_tmp;
fpga_free(tmp);
}
void interleave(float** data_in, int num_after_alignment, int chw) {} float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = (char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data;
fpga_free(tmp);
}
void align_element(char **data_in, int num, int chw) {
int i = 0;
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
printf("align %d \n", align_chw);
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char));
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
if (num_per_div_after_alignment != num_per_div_before_alignment) {
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
void interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_filter(float **data_in, int num, int channel, int height, int width,
int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in;
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
}
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
......
...@@ -22,14 +22,15 @@ namespace fpga { ...@@ -22,14 +22,15 @@ namespace fpga {
namespace filter { namespace filter {
void convert_to_hwc(float** data_in, int num, int channel, int height, void convert_to_hwc(float** data_in, int num, int channel, int height,
int width); int width);
float find_max(float* data_in, int num); float find_max(float* data_in, int data_size);
void quantize(float* data_in, int num); void quantize(float** data_in, int data_size, float max);
void align_element(float** data_in, int num, int chw); void align_element(float** data_in, int num, int chw);
void align_num(float** data_in, int num_per_div_before_alignment, int num, void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw); int chw);
void reorder(float** data_in, int num_after_alignment, int chw); void reorder(float** data_in, int num_after_alignment, int chw);
void interleave(float** data_in, int num_after_alignment, int chw); void interleave(float** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -11,3 +11,57 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,3 +11,57 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "image.h"
#include <memory.h>
#include "api.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float));
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int i = 0;
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float));
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw),
cw * sizeof(float));
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
}
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
...@@ -11,3 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,3 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
...@@ -254,30 +254,6 @@ class Tensor { ...@@ -254,30 +254,6 @@ class Tensor {
"Tensor's dims_ is out of bound. "); "Tensor's dims_ is out of bound. ");
} }
#ifdef PADDLE_MOBILE_FPGA
struct FPGAArgs {
friend class Tensor;
inline float *scale_pointer() { return scale_; }
inline float scale() { return *scale_; }
private:
float *scale_;
};
struct FPGAArgs fpga_args() const {
FPGAArgs args;
args.scale_ = scale.get();
return args;
}
void SetFpgaScale(float s) { *(scale.get()) = s; }
private:
std::shared_ptr<float> scale = std::make_shared<float>(0);
#endif
private: private:
/** /**
* @note Placeholder hides type T, so it doesn't appear as a * @note Placeholder hides type T, so it doesn't appear as a
...@@ -313,9 +289,12 @@ class Tensor { ...@@ -313,9 +289,12 @@ class Tensor {
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; } virtual void set_type(std::type_index type) { type_ = type; }
#ifndef PADDLE_MOBILE_FPGA
/*! the pointer of memory block. */ /*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_; std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
#else
std::shared_ptr<uint8_t> ptr_;
#endif
/*! the size of memory block. */ /*! the size of memory block. */
size_t size_; size_t size_;
...@@ -344,6 +323,34 @@ class Tensor { ...@@ -344,6 +323,34 @@ class Tensor {
* begins. * begins.
*/ */
size_t offset_; size_t offset_;
#ifdef PADDLE_MOBILE_FPGA
public:
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);
}
struct FPGAArgs {
friend class Tensor;
inline float *scale_pointer() { return scale_; }
inline float scale() { return *scale_; }
private:
float *scale_;
};
struct FPGAArgs fpga_args() const {
FPGAArgs args;
args.scale_ = scale.get();
return args;
}
void SetFpgaScale(float s) { *(scale.get()) = s; }
private:
std::shared_ptr<float> scale = std::make_shared<float>(0);
#endif
}; };
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册