Commit 4ea3ca7a authored by xiaogang, committed by GitHub

fix: update fpga backend and kernel (#2848)

* fix: update fpga backend and kernel
test=develop

* style: style fix
test=develop
Parent 4c1bf784
......@@ -32,7 +32,8 @@ class Debugger {
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
- if (op_type != "conv") { // NOLINT
+ if (op_config[op_type]) {
tensor->saveToFile(op_type, true);
}
}
......@@ -40,8 +41,21 @@ class Debugger {
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true;
op_config["dwconv"] = true;
op_config["ew_add"] = true;
op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["softmax"] = true;
}
};
......@@ -131,9 +145,7 @@ inline void save_tensor(const lite::Tensor* t,
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
......
......@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
- int8_t* data = static_cast<int8_t*> data_in;
+ int8_t* data = static_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
......@@ -84,6 +84,14 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
int calc_pack_num(int num_per_group, int group, int division_capacity) {
auto n = 1;
while ((num_per_group * (group + n - 1) / n) > division_capacity) {
n++;
}
return (n);
}
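Note: calc_pack_num returns the smallest pack count n for which num_per_group * (group + n - 1) / n (roughly num_per_group * ceil(group / n)) fits one hardware division. A standalone sketch with hypothetical numbers, not from this patch:

#include <cassert>

// mirrors the loop above, including its exact (unparenthesized) expression
int calc_pack_num_sketch(int num_per_group, int group, int division_capacity) {
  int n = 1;
  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
    n++;
  }
  return n;
}

int main() {
  // hypothetical: 32 aligned filters per group, 8 groups, capacity 128
  // n=1: 32*8/1 = 256 > 128; n=2: 32*9/2 = 144 > 128; n=3: 32*10/3 = 106 <= 128
  assert(calc_pack_num_sketch(32, 8, 128) == 3);
  return 0;
}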
void convert_to_hwc(int8_t* chw_data,
int8_t* hwc_data,
int num,
......@@ -231,10 +239,9 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
quantize(filter_start, quantized_start, chw, max);
- filter_max.push_back(max);
+ filter_max.push_back(1);
}
int8_t* hwc_data =
......@@ -256,6 +263,7 @@ int8_t* format_filter(float* data_in,
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
......
......@@ -31,6 +31,7 @@ int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
int calc_pack_num(int num_per_group, int group, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in,
......@@ -40,11 +41,13 @@ int8_t* format_filter(float* data_in,
int height,
int width,
int group_num,
- float max, // NOLINT
+ float max,
std::vector<float>& filter_max); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
......
......@@ -28,7 +28,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
-#define PADDLE_OS_LINUX
+#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
......@@ -61,17 +61,33 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
#endif
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
- if (ptr == NULL) {
+ if (ptr == MAP_FAILED) {
std::cout << "not enough memory !";
exit(-1);
}
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory !";
exit(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
exit(-1);
}
if (ptr == NULL) {
std::cout << "NULL returned, errno=" << errno
<< ", mmap failed with other errors other than memory usage !"
<< std::endl;
exit(-1);
}
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
memory_size_max = memory_size;
}
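Note: mmap()/mmap64() signal failure with MAP_FAILED ((void *)-1), never NULL, so the old `ptr == NULL` test could not catch allocation errors; errno is also only meaningful once MAP_FAILED has been returned. A minimal sketch of the corrected check (hypothetical helper, Linux-only):

#include <sys/mman.h>
#include <cerrno>
#include <cstdio>

void *checked_mmap(int fd, size_t size) {
  void *ptr = mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ptr == MAP_FAILED) {   // the only failure value mmap64 returns
    perror("mmap64");        // errno (ENOMEM, EINVAL, ...) says why
    return nullptr;
  }
  return ptr;                // success: errno is stale here and must not be checked
}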
......@@ -87,7 +103,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
- auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
+ auto iter = memory_map.begin();
while (iter != memory_map.end()) {
total += iter->second;
iter++;
......@@ -97,13 +113,15 @@ size_t fpga_diagnose_memory(int detailed) {
void fpga_free(void *ptr) {
size_t size = 0;
- auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
+ auto iter = memory_map.find(ptr);
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -230,6 +248,7 @@ int perform_bypass(const struct BypassArgs &args) {
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -238,6 +257,26 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
......
......@@ -28,7 +28,6 @@ namespace zynqmp {
typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
......@@ -44,14 +43,11 @@ enum DLayoutType {
};
enum ActiveType {
- TYPE_RELU = 0,
- TYPE_RELU6 = 1,
- TYPE_LEAK_RELU = 2,
- TYPE_SIGMOID = 3,
-};
-struct VersionArgs {
- void* buffer;
+ TYPE_NONE = 0,
+ TYPE_RELU = 1,
+ TYPE_RELU6 = 2,
+ TYPE_LEAKY_RELU = 3,
+ TYPE_SIGMOID = 4,
+};
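Note: with TYPE_NONE = 0 a zero-initialized ActiveParam now means "no activation", and TYPE_LEAK_RELU is renamed TYPE_LEAKY_RELU. The dispatch() methods later in this patch all translate this enum into InplaceArgs flags with the same if/else chain; a condensed sketch of that mapping (field names taken from the patch):

void map_activation(ActiveType type, InplaceArgs* inplace) {
  switch (type) {
    case TYPE_RELU:       inplace->relu_enable = true;       break;
    case TYPE_RELU6:      inplace->relu6_enable = true;      break;
    case TYPE_SIGMOID:    inplace->sigmoid_enable = true;    break;
    case TYPE_LEAKY_RELU: inplace->leaky_relu_enable = true; break;
    case TYPE_NONE:
    default: break;  // leave all in-place activation flags off
  }
}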
struct DeviceInfo {
......@@ -67,6 +63,11 @@ struct DeviceInfo {
uint32_t reserved6;
};
struct VersionArgs {
void* buffer;
size_t size;
};
struct MemoryCopyArgs {
void* src;
void* dest;
......@@ -78,7 +79,9 @@ struct MemoryCacheArgs {
size_t size;
};
-struct MemoryBarrierArgs {};
+struct MemoryBarrierArgs {
+ uint16_t dummy;
+};
struct BNArgs {
bool enabled;
......
......@@ -30,8 +30,13 @@ struct ReLUParam {
float leaky_relu_factor = 0.0f;
};
struct ActiveParam {
enum ActiveType type = TYPE_NONE;
float leaky_relu_factor;
};
struct PEParam {
ReLUParam relu;
ActiveParam activeParam;
};
struct InputParam : PEParam {
......@@ -100,24 +105,6 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -155,7 +142,8 @@ struct ElementwiseAddParam : PEParam {
struct ElementwiseMulParam : PEParam {
public:
- std::vector<Tensor*> inputs;
+ Tensor* input_x;
+ Tensor* input_y = nullptr;
Tensor* output = nullptr;
};
......@@ -223,6 +211,17 @@ struct PriorBoxParam : PEParam {
float offset;
};
struct YoloBoxParam : PEParam {
Tensor* input;
Tensor* imgSize;
Tensor* outputBoxes;
Tensor* outputScores;
int downsampleRatio;
std::vector<int> anchors;
int classNum;
float confThresh;
};
struct ScaleParam : PEParam {
public:
Tensor* input = nullptr;
......@@ -255,5 +254,24 @@ struct CropParam : PEParam {
std::vector<int> offsets;
std::vector<int> shape;
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
} // namespace zynqmp
} // namespace paddle
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/conv_process.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
namespace paddle {
namespace zynqmp {
......@@ -41,6 +42,8 @@ class ConvPE : public PE {
void apply() {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
......@@ -51,107 +54,28 @@ class ConvPE : public PE {
concatPE_.apply();
}
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
}
splitPE_.init();
splitPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
param_.input->shape().dimSize() == 4 &&
param_.input->shape().width() == 1 &&
- param_.input->shape().width() >= 2048) {
+ param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) { // NOLINT
}
- if (!use_cpu_) { // NOLINT
+ if (!use_cpu_) {
// param_.filter->releaseData();
}
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend =
std::min(hstart + kernel_height, static_cast<int>(image_height));
int wend =
std::min(wstart + kernel_width, static_cast<int>(image_width));
hstart = std::max(hstart, static_cast<int>(0));
wstart = std::max(wstart, static_cast<int>(0));
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
} else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
} else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
if (param_.relu.enabled && sum < 0) {
sum = -sum;
}
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
// exit(-1);
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -161,93 +85,98 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
// float16* data_out = output->data<float16>();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
float* mi = new float[in_channel];
float max = 0;
int out_index = 0;
for (int i = 0; i < out_channel; i++) {
float* image = image_addr;
float* filter_ptr = filter_data + i * in_channel;
float* out_ptr = mi;
#pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
// float32x4_t x0 = vld1q_f32(image);
// float32x4_t x1 = vld1q_f32(filter_ptr);
// float32x4_t r = vmulq_f32(x0, x1);
// vst1q_f32(out_ptr, r);
// image += 4;
// filter_ptr += 4;
// out_ptr += 4;
float value = image_addr[j] * filter_ptr[j];
mi[j] = value;
}
for (int h = 0; h < output->shape().height(); h++) {
for (int w = 0; w < output->shape().width(); w++) {
float sum = 0;
// #pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
}
sum += bias_data[i];
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out_index = h * out_width * out_channel + w * out_channel + i;
out[out_index] = sum;
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
}
out[i] = sum;
}
delete[] mi;
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
bool dispatch() {
fpga_reset();
if (use_cpu_) {
cpu_compute();
return true;
}
- inplace_.leaky_relu_enable =
-     (param_.relu.leaky_relu_factor != 0) ? true : false;
- inplace_.relu_enable =
-     inplace_.leaky_relu_enable ? false : param_.relu.enabled;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
inplace_.power_enable = false;
inplace_.normalize_enable = false;
- if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+ if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+     inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
- activeParamterArgs.type = TYPE_LEAK_RELU;
+ activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor =
- fp32_2_fp16(param_.relu.leaky_relu_factor);
+ fp32_2_fp16(param_.activeParam.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
if (split_channel) {
// splitPE_.param().input->saveToFile("input_image",true);
splitPE_.dispatch();
}
int ret = 0;
for (auto conv_param : params) {
// conv_param->input.printScale();
// if (split_channel) {
// conv_param->input.saveToFile("pack_image",true);
// }
ret |= compute_fpga_conv_basic(conv_param->args);
}
- if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+ if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+     inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
- activeParamterArgs.type = TYPE_LEAK_RELU;
+ activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
config_activation(activeParamterArgs);
}
......@@ -255,16 +184,29 @@ class ConvPE : public PE {
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
// std::cout << "concat size:" << size << std::endl;
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
// for (int n = 0; n < size - 1; n++) {
ElementwiseAddParam& add_param = addPE_.param();
add_param.inputs = {&params[0]->output, &params[1]->output};
add_param.output = param_.output;
addPE_.init();
addPE_.apply();
addPE_.dispatch();
// param_.output->printScale();
// params[0]->input.saveToFile("conv_1.txt");
// params[1]->input.saveToFile("conv_2.txt");
// params[0]->output.saveToFile("ew_o1.txt");
// params[1]->output.saveToFile("ew_o2.txt");
// std::cout << "\n ================== EW ================== \n";
// }
}
return ret == 0;
}
......@@ -272,8 +214,10 @@ class ConvPE : public PE {
private:
bool use_cpu_ = false;
bool split_channel = false;
ConvParam param_;
ConcatPE concatPE_;
SplitPE splitPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
......
......@@ -53,6 +53,16 @@ inline int get_split_num(Tensor* filter) {
return filter::calc_split_num(aligned_num, div_capacity);
}
inline int get_pack_num(Tensor* filter, int group_num) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment);
return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity);
}
inline void fill_scale_bias_const(ConvParam* param_) {
int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel});
......@@ -117,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn,
param_->bias()->setDataLocation(CPU);
}
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
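Note: lcm_ drives the repeat count used below. When channel is not a multiple of the 16-element alignment, the bias/scale vector is written lcm(channel, 16) / channel times so the repeated buffer ends exactly on an alignment boundary. A hedged sketch with hypothetical sizes:

// channel = 24: lcm(24, 16) = 48, repeat = 48 / 24 = 2 copies (48 is 16-aligned)
// channel = 32: 32 % 16 == 0, so repeat stays 1
int repeat_for(int channel, int alignment) {
  return (channel % alignment == 0) ? 1 : lcm_(channel, alignment) / channel;
}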
inline void format_bias_scale_new(Tensor* bias,
Tensor* scale,
Tensor* scale_bias) {
Shape& bias_shape = bias->shape();
int channel = bias_shape.channel();
int repeat = 1;
int alignment = 16;
int length = channel;
if (channel % alignment != 0 || channel < alignment) {
int c_lcm = lcm_(channel, alignment);
repeat = c_lcm / (channel);
}
Shape shape(N, {2 * channel * repeat});
float16* scale_bias_data = scale_bias->mutableData<float16>(FP16, shape);
float* bias_data_float = bias->data<float>();
float* scale_data_float = scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value_bias = float_to_half(bias_data_float[j]);
scale_bias_data[i * length + j] = value_bias;
}
}
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value_scale = float_to_half(scale_data_float[j]);
scale_bias_data[i * length + j + length * repeat] = value_scale;
}
}
}
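Note: the resulting scale_bias buffer holds 2 * channel * repeat fp16 values, biases first and scales second; this is what args.sb_address points at in the conv setup below.

// layout produced above (C = channel):
//   [ bias[0..C-1] x repeat | scale[0..C-1] x repeat ]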
inline void format_scale_bias(Tensor* scale,
Tensor* bias,
Tensor* filter,
......@@ -180,8 +234,10 @@ inline void format_scale_bias(Tensor* scale,
inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
- std::vector<float>& scales) { // NOLINT
+ std::vector<float>& scales, // NOLINT
+ float max) {
float max_value = find_max(*filter);
// max_value = max; //TODO: global quantization for filter
Shape& filter_shape = filter->shape();
int mem_size;
......@@ -206,10 +262,22 @@ inline void format_filter(Tensor* filter,
memcpy(src, quantized_data, mem_size);
quantized_filter->flush();
- for (size_t i = 0; i < max_values.size(); i++) {
-   scales.push_back(max_values[i] / max_value);
- }
fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
// }
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
// ofs.open("quant.txt");
// for (int i = 0; i < mem_size; i++) {
// float value = quantized_data[i];
// ofs << value << std::endl;
// }
// ofs.close();
// exit(-1);
}
inline void format_dw_filter(Tensor* filter,
......@@ -256,17 +324,10 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
- int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
+ int split_num = get_split_num(param.filter);
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
- auto chw = filter->shape().channel() * filter->shape().height() *
-            filter->shape().width();
- auto num = filter->shape().num();
- int div_capacity = filter::calc_division_capacity(chw);
- int filter_num_alignment = filter::get_filter_num_alignment();
- int aligned_num =
-     align_to_x(num / param.groups, filter_num_alignment) * param.groups;
- split_num = filter::calc_split_num(aligned_num, div_capacity);
+ float max = find_max(*filter);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
......@@ -310,45 +371,29 @@ inline void split_filter_num(const ConvParam& c_param) {
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
- if (param.groups != 1) {
-   int mem_factor =
-       32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
-   conv_param->filter.setMemScale(mem_factor);
- }
- std::vector<float> v; // TODO(chonwhite): change local variable name
- format_filter(&new_filter, &(conv_param->filter), param.groups, v);
+ std::vector<float> v; // TODO(chonwhite) change variable name;
+ format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
conv_param->filter.setDataType(INT8);
- int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
Tensor bias;
int chnnnel_start = i * filter_num_per_div;
- Shape s_shape(N, {filter_num});
+ Shape s_shape(NC, {1, filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
- scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
+ scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
- Shape sb_shape(N, {sb_num});
- format_scale_bias(&scale,
-                   &bias,
-                   &conv_param->filter,
-                   &conv_param->scaleBias,
-                   param.groups);
+ format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
- float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
- args.relu_enabled = param.relu.enabled;
- args.sb_address = conv_param->scaleBias.data<float>();
- args.sb_address = bs_data;
+ args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -372,6 +417,135 @@ inline void split_filter_num(const ConvParam& c_param) {
}
}
inline void pack_channel_filter(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* out = param.output;
Tensor* filter = param.filter;
int filter_num_alignment = filter::get_filter_num_alignment();
auto filter_num = filter->shape().num();
int pack_num = get_pack_num(param.filter, param.groups);
int group_per_pack = (param.groups + pack_num - 1) / pack_num;
int filter_per_group = filter_num / param.groups;
int filter_per_pack = filter_per_group * group_per_pack;
int channel_per_pack = filter->shape().channel() * group_per_pack;
float max = find_max(*filter);
Shape& out_shape = out->shape();
for (int i = 0; i < pack_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
conv_param->output.setDataLocation(Device);
conv_param->output.setAligned(true);
float16* out_address = nullptr;
float* out_scale_address = nullptr;
float16* input_address = nullptr;
ConvArgs& args = conv_param->args;
if (pack_num == 1) {
out_address = out->data<float16>();
out_scale_address = out->scale();
}
int new_group = param.groups;
int filter_current_pack = filter->shape().num();
int channel_current_pack = input->shape().channel();
new_group = i == pack_num - 1
? param.groups - (pack_num - 1) * group_per_pack
: group_per_pack;
filter_current_pack = new_group * filter_per_group;
channel_current_pack = new_group * filter->shape().channel();
if (pack_num == 1) {
input_address = input->data<float16>();
} else {
Shape in_shape(NCHW,
{1,
channel_current_pack,
input->shape().height(),
input->shape().width()});
input_address = conv_param->input.mutableData<float16>(FP16, in_shape);
}
if (pack_num != 1) {
Shape shape(
NHWC,
{1, out_shape.height(), out_shape.width(), filter_current_pack});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
}
Shape f_shape(NCHW,
{filter_current_pack,
filter->shape().channel(),
filter->shape().height(),
filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
int filter_hwc = filter->shape().height() * filter->shape().width() *
filter->shape().channel();
memcpy(new_filter_data,
filter->data<float>() + i * filter_per_pack * filter_hwc,
filter_current_pack * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
float mem_factor = filter_num_alignment / filter_per_pack;
conv_param->filter.setMemScale(mem_factor);
std::vector<float> v; // TODO(chonwhite) change variable name
format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
Tensor bias;
int chnnnel_start = i * filter_per_pack;
Shape s_shape(NC, {1, filter_current_pack});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
args.group_num = new_group;
args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
args.kernel.width = new_filter.shape().width();
args.filter_address = conv_param->filter.data<int8_t>();
args.filter_num = filter_current_pack;
args.filter_scale_address = conv_param->filter.scale();
args.image.address = input_address;
args.image.scale_address = input->scale();
args.image.channels = channel_current_pack;
args.image.width = input->shape().width();
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
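Note: a hypothetical walk-through of the pack arithmetic above, assuming groups = 8, filter_num = 64, and get_pack_num(...) returning 2 (values are illustrative, not from the patch):

// group_per_pack   = ceil(8 / 2)         = 4 groups per pack
// filter_per_group = 64 / 8              = 8 filters per group
// filter_per_pack  = 8 * 4               = 32 filters per pack
// channel_per_pack = filter_channel * 4    (input channels one pack consumes)
// last pack: new_group = 8 - (2 - 1) * 4 = 4 groups (remainder handling)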
inline void split_channel(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
......@@ -383,6 +557,8 @@ inline void split_channel(const ConvParam& c_param) {
Shape bs_shape(N, {channel});
float max = 1.0f;
for (int i = 0; i < num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -406,7 +582,8 @@ inline void split_channel(const ConvParam& c_param) {
}
new_filter.flush();
std::vector<float> scales;
- format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
+ format_filter(
+     &new_filter, &(conv_param->filter), param.groups, scales, max);
Tensor bias;
Tensor scale;
......@@ -428,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) {
ConvArgs& args = conv_param->args;
args.group_num = param.groups;
- args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
......@@ -457,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* output = param.output;
if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 &&
input->shape().width() == 1) {
split_channel(c_param);
return 1;
- } else {
+ } else if (param.groups == 1) {
split_filter_num(c_param);
return 0;
} else {
pack_channel_filter(c_param);
return 0;
}
}
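Note: the return value becomes split_axis in ConvPE::apply(). Condensed, the new dispatch appears to read:

// 4-D output, in_channel > 2047, width == 1  -> split_channel(...)       returns 1
// groups == 1                                -> split_filter_num(...)    returns 0
// otherwise (grouped conv)                   -> pack_channel_filter(...) returns 0
//
// axis 0 splits along output channels (partial outputs concatenated),
// axis 1 splits along input channels (partial outputs element-wise added).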
......
......@@ -24,6 +24,17 @@ namespace zynqmp {
class DepthwiseConvPE : public PE {
public:
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
......@@ -37,17 +48,40 @@ class DepthwiseConvPE : public PE {
Tensor* output = param.output;
int channel = output->shape().channel();
- float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
int repeat = 1;
int alignment = 16;
int length = channel;
if (channel % alignment != 0 || channel < alignment) {
int c_lcm = lcm_(channel, alignment);
repeat = c_lcm / (channel);
}
Shape shape(N, {channel * repeat});
float16* b_data = bias_.mutableData<float16>(FP16, shape);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert the bias from float to float16
- for (int i = 0; i < channel; i++) {
-   b_data[i] = float_to_half(new_bias_data[i]);
// for (int i = 0; i < channel; i++) {
// b_data[i] = float_to_half(new_bias_data[i]);
// }
// pad the bias to the 16-element alignment by repeating it across hw
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = float_to_half(new_bias_data[j]);
b_data[i * length + j] = value;
}
}
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
- memcpy(b_data, new_bias_data, channel * sizeof(float16));
+ // memcpy(b_data, new_bias_data, channel * sizeof(float16));
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
// float16 value = float_to_half(bias_data_float[j]);
b_data[i * length + j] = new_bias_data[j];
}
}
bias_.flush();
}
......@@ -62,6 +96,8 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......@@ -89,20 +125,33 @@ class DepthwiseConvPE : public PE {
args.sub_conv_num = 1;
param.args = args;
- inplace_.relu_enable = param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
}
bool dispatch() {
param_.input->syncToDevice();
- if (param_.relu.enabled) {
-   inplace_.relu_enable = param_.relu.enabled;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
bool ret = compute_fpga_dwconv(param_.args) == 0;
- if (param_.relu.enabled) {
+ if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+     inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
}
return ret;
......
......@@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE {
bool dispatch() {
param_.inputs[0]->syncToDevice();
param_.inputs[1]->syncToDevice();
- InplaceArgs inplace_args = {0};
- if (param_.relu.enabled) {
-   inplace_args.relu_enable = true;
-   config_inplace(inplace_args);
// InplaceArgs inplace_ = {0};
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
compute_fpga_ewadd(param_.ewargs);
- if (param_.relu.enabled) {
-   inplace_args.relu_enable = false;
-   config_inplace(inplace_args);
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.leaky_relu_enable = false;
config_inplace(inplace_);
}
return true;
}
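Note: the enable -> config_inplace -> compute -> clear -> config_inplace pattern above now repeats in ConvPE, DepthwiseConvPE, and here. A hedged RAII sketch of the same idea (hypothetical helper, not part of the patch; assumes the flags are plain bools):

struct ScopedInplace {
  InplaceArgs& args;
  explicit ScopedInplace(InplaceArgs& a) : args(a) { config_inplace(args); }
  ~ScopedInplace() {  // restore the hardware to pass-through on scope exit
    args.relu_enable = args.relu6_enable = false;
    args.sigmoid_enable = args.leaky_relu_enable = false;
    config_inplace(args);
  }
};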
......@@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE {
private:
ElementwiseAddParam param_;
InplaceArgs inplace_ = {0};
};
} // namespace zynqmp
......
......@@ -29,17 +29,17 @@ class ElementwiseMulPE : public PE {
}
void apply() {
- Tensor* input = param_.inputs[0];
+ Tensor* input = param_.input_x;
Tensor* output = param_.output;
- int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+ int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
ScaleArgs& args = args_;
- args.scale_address = param_.inputs[1]->data<void>();
+ args.scale_address = param_.input_y->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
......
......@@ -37,10 +37,9 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -49,6 +48,9 @@ class FullyConnectedPE : public PE {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
// if (num == 2) {
// return;
// }
int height = param_.input->shape().height();
int width = param_.input->shape().width();
......@@ -66,6 +68,7 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
......@@ -84,15 +87,51 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
- bool dispatch() { return convPE_.dispatch(); }
void cpu_compute() {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
float* filter_data = param_.filter->data<float>();
float max = 0.0f;
Tensor* input = param_.input;
Tensor* output = param_.output;
float16* input_data = input->data<float16>();
float16* output_data = output->data<float16>();
for (int i = 0; i < num; i++) {
float sum = 0;
float bias = param_.bias->data<float>()[i];
for (int j = 0; j < chw; j++) {
float scale = filter_data[j * num + i];
float data = half_to_float(input_data[j]);
sum += scale * data;
}
float result = sum + bias;
output_data[i] = float_to_half(result);
if (max < result) {  // track the float value, not the raw half bits
  max = result;
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
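Note: scale()[0] = max / 127 and scale()[1] = 127 / max is the pair this backend writes after every CPU fallback (see PoolingPE and ConvPE above). Inferred from usage elsewhere in the patch, not documented in it:

// float_value = int8_value * scale()[0];   // scale()[0] = max / 127 (dequantize)
// int8_value  = float_value * scale()[1];  // scale()[1] = 127 / max (quantize)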
bool dispatch() {
// int num = param_.filter->shape().channel();
// if (num == 2) {
// cpu_compute();
// return 1;
// } else {
return convPE_.dispatch();
// }
}
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
......@@ -47,8 +47,10 @@ class GRUPE : public PE {
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
......@@ -74,7 +76,9 @@ class GRUPE : public PE {
reset_hidden_.mutableData<void>(FP16, hidden_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
// mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.input_x = &reset_gate_;
mul_param.input_y = &prev_hidden_;
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
......@@ -115,11 +119,8 @@ class GRUPE : public PE {
if (hidden_prev) {
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
- prev_hidden_.saveToFile("prev_.txt");
}
mul_pe_.dispatch();
- reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
......
......@@ -72,8 +72,10 @@ class NormPE : public PE {
input_float.mutableData<float>(FP32, param_.input->shape());
float_out.mutableData<float>(FP32, param_.output->shape());
// param_.input->syncToDevice();
input_float.copyFrom(param_.input);
input_float.syncToCPU();
// input_float.saveToFile("normalize_", true);
int channel = input_float.shape().channel();
int height = input_float.shape().height();
......@@ -85,6 +87,7 @@ class NormPE : public PE {
float* out_ptr = float_out.data<float>();
int loop = height * width;
#pragma omp parallel for
for (int i = 0; i < loop; i++) {
float sum = param_.epsilon;
for (int c = 0; c < channel; c++) {
......@@ -98,11 +101,26 @@ class NormPE : public PE {
}
}
float_out.flush();
// float_out.saveToFile("normalize_", true);
param_.output->copyFrom(&float_out);
}
bool dispatch() {
cpuCompute();
// std::cout << "CPU normalize ---------------------" << std::endl;
// param_.input->syncToDevice();
// // param_.input->saveToFile("normalize_fpga_", true);
// config_norm_param(norm_param_args_);
// inplace_args_.normalize_enable = true;
// config_inplace(inplace_args_);
// perform_bypass(bypass_args_);
// inplace_args_.normalize_enable = false;
// config_inplace(inplace_args_);
// compute_norm(norm_args_);
// param_.output->saveToFile("normalize_fpga_", true);
// std::cout << "FPGA normalize ---------------------" << std::endl;
return true;
}
......
......@@ -35,14 +35,17 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
- uint32_t k_height = param_.kernelSize[0];
- uint32_t k_width = param_.kernelSize[1];
+ uint32_t k_height = 1;
+ uint32_t k_width = 1;
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
} else {
k_height = param_.kernelSize[0];
k_width = param_.kernelSize[1];
}
PoolingArgs args = {0};
......@@ -65,9 +68,12 @@ class PoolingPE : public PE {
args.out_width = output->shape().width();
param_.poolingArgs = args;
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 7 || k_height > 7);
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
-            (k_width > 7 || k_height > 7);
- use_cpu_ = param_.type == AVERAGE;
+            (k_width > 255 || k_height > 255);
+ // use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -76,6 +82,7 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
......@@ -110,6 +117,8 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -138,7 +147,9 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
......@@ -155,6 +166,7 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
void cpu_compute() {
......@@ -184,11 +196,14 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
......@@ -89,7 +89,6 @@ class ScalePE : public PE {
}
}
}
- float* scale_data_float = param_.scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
......
......@@ -53,20 +53,37 @@ class SplitPE : public PE {
int64_t src_after = src_stride_numel[axis];
int64_t dst_after = dst_stride_numel[axis];
// PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
// "src and dst tensor should have the same dims
// size.");
for (int64_t i = 0; i < axis; ++i) {
if (i < axis) {
// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
// dst_stride_numel[i] /
// dst_stride_numel[axis],
// "src and dst should have the same elements "
// "except the specified axis.");
} else if (i == axis) {
continue;
} else {
// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
// "src and dst should have the same elements "
// "except the specified axis.");
}
}
for (int64_t i = 0; i < before; ++i) {
- memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+ memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
}
}
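Note: a worked example of the strided copy above (hypothetical shapes; `before` and `size` are assumed to come from the elided lines at the top of the function):

// splitting a [2, 6] row-major tensor into two [2, 3] halves along axis 1:
//   src_stride_numel = {12, 6}, dst_stride_numel = {6, 3}
//   src_after = 6, dst_after = 3, before = 12 / 6 = 2 rows, size = 3
// -> two memcpy calls, one contiguous 3-element run per row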
- void split3D() { int axis = param_.axis; }
void split3D() {
int axis = param_.axis;
// float16* dst = param_.output->data<float16>();
// std::vector<int>& dst_dims = ;
// StridedNumelCopyWithAxis();
}
bool dispatch() {
Tensor* input = param_.input;
......@@ -88,6 +105,7 @@ class SplitPE : public PE {
in_stride,
out_stride[axis]);
input_offset += out_stride[axis];
// out->flush();
}
return true;
}
......@@ -95,21 +113,26 @@ class SplitPE : public PE {
std::vector<Tensor*> outputs = param_.outputs;
int in_channel = input->shape().channel();
- int split_channel = input->shape().channel() / param_.num;
+ // int split_channel = input->shape().channel() / param_.num;
int hw = input->shape().height() * input->shape().width();
float16* in_data = input->data<float16>();
for (int i = 0; i < hw; i++) {
int channel_stride = 0;
for (int n = 0; n < outputs.size(); n++) {
Tensor* out = outputs[n];
float16* out_data = out->data<float16>();
- memcpy(out_data + i * split_channel,
-        in_data + i * in_channel + n * split_channel,
-        split_channel * sizeof(float16));
+ memcpy(out_data + i * out->shape().channel(),
+        in_data + i * in_channel + channel_stride,
+        out->shape().channel() * sizeof(float16));
channel_stride += out->shape().channel();
}
}
for (int n = 0; n < outputs.size(); n++) {
Tensor* out = outputs[n];
out->flush();
out->copyScaleFrom(input);
}
return true;
......@@ -120,5 +143,6 @@ class SplitPE : public PE {
private:
SplitParam param_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -300,14 +300,12 @@ class Tensor {
}
void flush() {
- size_t memorySize =
-     shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+ size_t memorySize = placeHolder_->memorySize();
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() {
- size_t memorySize =
-     shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+ size_t memorySize = placeHolder_->memorySize();
fpga_invalidate(placeHolder_->data(), memorySize);
}
......@@ -385,6 +383,7 @@ class Tensor {
invalidate();
std::ofstream ofs;
ofs.open(path);
ofs << scale()[0] << " / " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
......
......@@ -46,7 +46,10 @@ void ConvCompute::PrepareForRun() {
conv_param.dilations = *param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
- conv_param.relu.enabled = param.fuse_relu;
+ if (param.fuse_relu) {
+   conv_param.activeParam.type = zynqmp::TYPE_RELU;
+ }
dw_conv_pe_.init();
dw_conv_pe_.apply();
......@@ -65,7 +68,16 @@ void ConvCompute::PrepareForRun() {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
- conv_param.relu.enabled = param.fuse_relu;
+ if (param.fuse_relu) {
+   conv_param.activeParam.type = zynqmp::TYPE_RELU;
+ }
// conv_param.filter->saveToFile("conv_filter_", true);
// if (param.bias != nullptr) {
// std::cout << "param.bias != nullptr" << std::endl;
// conv_param.bias()->saveToFile("conv_bias_", true);
// }
conv_pe_.init();
conv_pe_.apply();
}
......@@ -76,8 +88,14 @@ void ConvCompute::Run() {
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
#endif
} else {
// zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
......@@ -103,3 +121,21 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ConvCompute,
def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -33,7 +33,8 @@ void ElementwiseAddCompute::PrepareForRun() {
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
- ew_param.relu.enabled = false;
+ ew_param.activeParam.type = zynqmp::TYPE_NONE;
pe_.init();
pe_.apply();
......@@ -56,7 +57,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
- ew_param.relu.enabled = true;
+ ew_param.activeParam.type = zynqmp::TYPE_RELU;
pe_.init();
pe_.apply();
}
......@@ -76,7 +77,7 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_param.input = param.X->ZynqTensor();
scale_param.output = param.Out->ZynqTensor();
- scale_param.relu.enabled = false;
+ scale_param.activeParam.type = zynqmp::TYPE_NONE;
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
......@@ -124,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -93,18 +93,25 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
}
// do elementwise add/sub/max...
if (elt_type == "add") {
- for (int i = 0; i < batch; ++i) {
-   for (int j = 0; j < channels; ++j) {
-     int offset = (i * channels + j) * num;
-     const dtype* din_ptr = x_data + offset;
-     const dtype diny_data = y_data[j];
-     dtype* dout_ptr = out_data + offset;
-     for (int k = 0; k < num; ++k) {
-       *dout_ptr = sum(*din_ptr, diny_data);
-       dout_ptr++;
-       din_ptr++;
-     }
-   }
// for (int i = 0; i < batch; ++i) {
// for (int j = 0; j < channels; ++j) {
// int offset = (i * channels + j) * num;
// const dtype* din_ptr = x_data + offset;
// const dtype diny_data = y_data[j];
// dtype* dout_ptr = out_data + offset;
// for (int k = 0; k < num; ++k) {
// *dout_ptr =
// zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
// zynqmp::half_to_float(diny_data)));
// dout_ptr++;
// din_ptr++;
// }
// }
// }
int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
for (int i = 0; i < count; ++i) {
out_data[i] = zynqmp::float_to_half(sum(
zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
}
} else if (elt_type == "sub") {
for (int i = 0; i < batch; ++i) {
......@@ -148,9 +155,9 @@ TEST(elementwise_add, compute) {
lite::Tensor x, y, output, output_ref;
for (auto n : {1}) {
- for (auto c : {8}) {
-   for (auto h : {8}) {
-     for (auto w : {8}) {
+ for (auto h : {72}) {
+   for (auto w : {192}) {
+     for (auto c : {24}) {
for (auto axis : {0}) {
for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
......@@ -174,10 +181,16 @@ TEST(elementwise_add, compute) {
auto* output_ref_data =
output_ref.mutable_data<float16>(TARGET(kFPGA));
for (int i = 0; i < x_dim.production(); i++) {
- x_data[i] = zynqmp::float_to_half(i);
+ float sign = i % 3 == 0 ? -0.03 : 0.05f;
+ float x = sign * (i % 128);
+ std::cout << "x:" << x << std::endl;
+ x_data[i] = zynqmp::float_to_half(x);
}
for (int i = 0; i < y_dim.production(); i++) {
- y_data[i] = zynqmp::float_to_half(i);
+ float sign = i % 3 == 0 ? -0.03 : 0.05f;
+ float y = sign * (i % 128);
+ std::cout << "y:" << y << std::endl;
+ y_data[i] = zynqmp::float_to_half(y);
}
param.X = &x;
param.Y = &y;
......@@ -190,7 +203,14 @@ TEST(elementwise_add, compute) {
elementwise_compute_ref<float16>(param, "add", "");
for (int i = 0; i < output.dims().production(); i++) {
- EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+ std::cout << "output_data:"
+           << zynqmp::half_to_float(output_data[i])
+           << ",output_ref_data:"
+           << zynqmp::half_to_float(output_ref_data[i])
+           << std::endl;
+ EXPECT_NEAR(zynqmp::half_to_float(output_data[i]),
+             zynqmp::half_to_float(output_ref_data[i]),
+             1e-5);
}
}
}
......@@ -209,73 +229,74 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
ASSERT_TRUE(fusion_elementwise_add_activation.front());
}
TEST(fusion_elementwise_add_activation_fpga, init) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
}
// TEST(fusion_elementwise_add_activation_fpga, init) {
// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
// ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
// ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
// }
TEST(fusion_elementwise_add_activation_fpga, compute) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
// TEST(fusion_elementwise_add_activation_fpga, compute) {
// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
// operators::FusionElementwiseActivationParam param;
// lite::Tensor x, y, output, output_ref;
for (auto act_type : {"relu"}) {
for (auto n : {1}) {
for (auto c : {8}) {
for (auto h : {8}) {
for (auto w : {8}) {
for (auto axis : {0}) {
for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
// for (auto act_type : {"relu"}) {
// for (auto n : {1}) {
// for (auto c : {8}) {
// for (auto h : {8}) {
// for (auto w : {8}) {
// for (auto axis : {0}) {
// for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
// auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
// auto y_dim = DDim(yd);
// int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
if (axis_t + y_dim.size() > 4) continue;
bool flag = false;
for (int i = 0; i < y_dim.size(); i++) {
if (x_dim[i + axis_t] != y_dim[i]) flag = true;
}
if (flag) continue;
// if (axis_t + y_dim.size() > 4) continue;
// bool flag = false;
// for (int i = 0; i < y_dim.size(); i++) {
// if (x_dim[i + axis_t] != y_dim[i]) flag = true;
// }
// if (flag) continue;
x.Resize(x_dim);
y.Resize(y_dim);
output.Resize(x_dim);
output_ref.Resize(x_dim);
auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
auto* output_ref_data =
output_ref.mutable_data<float16>(TARGET(kFPGA));
for (int i = 0; i < x_dim.production(); i++) {
float sign = i % 3 == 0 ? -1.0f : 1.0f;
x_data[i] = zynqmp::float_to_half(i * sign);
}
for (int i = 0; i < y_dim.production(); i++) {
float sign = i % 2 == 0 ? 0.5f : -0.5f;
y_data[i] = zynqmp::float_to_half(i * sign);
}
param.X = &x;
param.Y = &y;
param.axis = axis;
param.Out = &output;
param.act_type = act_type;
fusion_elementwise_add_activation.SetParam(param);
fusion_elementwise_add_activation.PrepareForRun();
fusion_elementwise_add_activation.Run();
param.Out = &output_ref;
elementwise_compute_ref<float16>(param, "add", act_type);
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
}
// x.Resize(x_dim);
// y.Resize(y_dim);
// output.Resize(x_dim);
// output_ref.Resize(x_dim);
// auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
// auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
// auto* output_data =
// output.mutable_data<float16>(TARGET(kFPGA));
// auto* output_ref_data =
// output_ref.mutable_data<float16>(TARGET(kFPGA));
// for (int i = 0; i < x_dim.production(); i++) {
// float sign = i % 3 == 0 ? -1.0f : 1.0f;
// x_data[i] = zynqmp::float_to_half(i * sign);
// }
// for (int i = 0; i < y_dim.production(); i++) {
// float sign = i % 2 == 0 ? 0.5f : -0.5f;
// y_data[i] = zynqmp::float_to_half(i * sign);
// }
// param.X = &x;
// param.Y = &y;
// param.axis = axis;
// param.Out = &output;
// param.act_type = act_type;
// fusion_elementwise_add_activation.SetParam(param);
// fusion_elementwise_add_activation.PrepareForRun();
// fusion_elementwise_add_activation.Run();
// param.Out = &output_ref;
// elementwise_compute_ref<float16>(param, "add", act_type);
// for (int i = 0; i < output.dims().production(); i++) {
// EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
// }
// }
// }
// }
// }
// }
// }
// }
// }
} // namespace fpga
} // namespace kernels
......@@ -283,4 +304,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute) {
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
......@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(feed,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
}
void FetchCompute::Run() {
- pe_.dispatch();
auto& param = this->Param<param_t>();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
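// Note: this guard grows fetch_list on demand so any column index is writable
// before dispatch, e.g. (hypothetically) col == 3 with a current size() of 2
// triggers resize(4), after which at(3) is valid.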
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
......@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kAny),
-                                       DATALAYOUT(kAny))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
......@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
host_host)
-    .BindInput("X",
-               {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kAny),
-                                       DATALAYOUT(kAny))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>
......@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
void GRUCompute::Run() {
auto& param = this->Param<param_t>();
param.hidden->mutable_data<float>();
// inputs
auto input = param.input;
auto h0 = param.h0;
......@@ -130,6 +132,7 @@ void GRUCompute::Run() {
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
} else {
gru_value.prev_out_value = nullptr;
gru_tensors.pre_output = nullptr;
......@@ -169,6 +172,7 @@ void GRUCompute::Run() {
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
......@@ -187,11 +191,6 @@ void GRUCompute::Run() {
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
- save_tensor(const_cast<Tensor*>(input), "_input.txt");
- save_tensor(hidden, "_gru.txt");
- exit(-1);
}
} // namespace fpga
......
......@@ -119,7 +119,79 @@ class IoCopyFpgaToHostCompute
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
void hwc_to_chw(float* chw_data,
float* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
int index = 0;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
index++;
}
}
}
}
}
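Note: a minimal usage sketch of the repack helper above (hypothetical 1x3x2x2 tensor, not from the patch):

float hwc[12];   // layout: for each (h, w), 3 consecutive channel values
float chw[12];   // layout: the full 2x2 plane of c0, then c1, then c2
// fill hwc[] ... then:
hwc_to_chw(chw, hwc, /*num=*/1, /*channel=*/3, /*height=*/2, /*width=*/2);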
class IoCopyFpgaToHostCHWCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
int num = 1;
int channel = 1;
int height = 1;
int width = 1;
auto dims = param.y->ZynqTensor()->shape();
hwc_to_chw(chw_data,
hwc_data,
dims.num(),
dims.channel(),
dims.height(),
dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......@@ -170,7 +242,7 @@ REGISTER_LITE_KERNEL(io_copy,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kARM),
+                {LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -179,8 +251,8 @@ REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
-                    paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
-                    device_to_host_22)
+                    paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
+                    device_to_host_chw)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
outs->Resize({static_cast<int64_t>(e - s), out_dim});
}
}
LoD lod;
......
......@@ -34,7 +34,9 @@ void PoolCompute::PrepareForRun() {
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
- pool_param.relu.enabled = false;
+ pool_param.activeParam.type = zynqmp::TYPE_RELU;
pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
......