Unverified · Commit e9591074 authored by Artem Tsvetkov, committed by GitHub

Updated utility functions to work with embARC MLI Library 2.0 for ARC (#231)

* Updated utility functions to work with embARC MLI Library 2.0

* Updated copyrights in several files.

* Minor fix for mli_tf_utils.h

* Update mli_tf_utils.h
Parent d47d48e3
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mli_interface.h" // NOLINT
#include <math.h>
namespace tflite {
namespace ops {
namespace micro {
template <>
int8_t* MliTensorInterface::Data<int8_t>(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I8);
return static_cast<int8_t*>(tensor_->data);
}
template <>
int32_t* MliTensorInterface::Data<int32_t>(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I32);
return static_cast<int32_t*>(tensor_->data);
}
template <>
int32_t* MliTensorInterface::Scale(void) {
return &tensor_->el_params.asym.scale.i32;
}
template <>
int32_t** MliTensorInterface::Scale(void) {
return &tensor_->el_params.asym.scale.pi32;
}
template <>
void MliTensorInterface::SetData(int8_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I8);
tensor_->data = data;
tensor_->capacity = capacity;
}
template <>
void MliTensorInterface::SetData(int32_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I32);
tensor_->data = data;
tensor_->capacity = capacity;
}
mli_tensor* MliTensorInterface::MliTensor(void) { return tensor_; }
const mli_tensor* MliTensorInterface::MliTensor(void) const {
return static_cast<const mli_tensor*>(
const_cast<MliTensorInterface*>(this)->MliTensor());
}
uint32_t* MliTensorInterface::Rank(void) { return &tensor_->rank; }
const uint32_t* MliTensorInterface::DataCapacity(void) const {
return &tensor_->capacity;
}
mli_element_type* MliTensorInterface::ElType(void) { return &tensor_->el_type; }
template <>
int16_t* MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.asym.zero_point.i16;
}
template <>
int16_t** MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.asym.zero_point.pi16;
}
uint32_t* MliTensorInterface::ZeroPointCapacity(void) { return nullptr; }
int32_t* MliTensorInterface::Dim(void) { return &tensor_->el_params.asym.dim; }
uint32_t* MliTensorInterface::ScaleCapacity(void) { return nullptr; }
template <>
int8_t* MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.asym.scale_frac_bits;
}
uint32_t* MliTensorInterface::ScaleFracBitsCapacity(void) { return nullptr; }
int32_t* MliTensorInterface::MemStride(void) { return tensor_->mem_stride; }
uint32_t* MliTensorInterface::Shape(void) { return tensor_->shape; }
const uint32_t* MliTensorInterface::Shape(void) const {
return static_cast<const uint32_t*>(
const_cast<MliTensorInterface*>(this)->Shape());
}
void MliTensorInterface::SetScale(float fscale) {
int exp;
frexpf(fscale, &exp);
int frac_bits = 31 - exp;
int32_t iscale = (int32_t)((1ll << frac_bits) * fscale + 0.5f);
*(this->ScaleFracBits<int8_t*>()) = frac_bits;
*(this->Scale<int32_t*>()) = (int32_t)iscale;
}
void MliTensorInterface::SetScalePerChannel(float* fscale,
const int num_channels) {
int min_frac_bits;
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = 31 - exp;
if (i == 0) {
min_frac_bits = cur_frac_bits;
} else {
min_frac_bits =
min_frac_bits < cur_frac_bits ? min_frac_bits : cur_frac_bits;
}
}
*this->ScaleFracBits<int8_t*>() = min_frac_bits;
for (int i = 0; i < num_channels; i++) {
int32_t iscale = (int32_t)((1ll << min_frac_bits) * fscale[i] + 0.5f);
(*this->Scale<int32_t**>())[i] = iscale;
}
}
void MliTensorInterface::SetElType(TfLiteType type) {
if (type == kTfLiteInt8) {
*this->ElType() = MLI_EL_ASYM_I8;
} else if (type == kTfLiteInt32) {
*this->ElType() = MLI_EL_ASYM_I32;
} else {
TF_LITE_FATAL("Wrong data type. Expected int8_t or int32_t.");
}
}
} // namespace micro
} // namespace ops
} // namespace tflite
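As a side note on the Q31 arithmetic in SetScale() above: frexpf() yields the binary exponent of the float scale, frac_bits is chosen as 31 minus that exponent, and the scale is rounded to a 32-bit fixed-point value. A minimal standalone sketch of that conversion (illustrative only, not part of this change; QuantizeScaleQ31 is a hypothetical name):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical helper mirroring the MLI 1.x SetScale() arithmetic above.
static void QuantizeScaleQ31(float fscale, int32_t* iscale, int* frac_bits) {
  int exp;
  frexpf(fscale, &exp);   // fscale = m * 2^exp, with m in [0.5, 1)
  *frac_bits = 31 - exp;  // keeps m * 2^31 inside the int32 range
  *iscale = (int32_t)((1ll << *frac_bits) * fscale + 0.5f);
}

int main() {
  int32_t iscale;
  int frac_bits;
  QuantizeScaleQ31(0.05f, &iscale, &frac_bits);
  // 0.05 = 0.8 * 2^-4, so frac_bits = 35 and iscale is about 0.05 * 2^35
  // (roughly 1.7e9); dividing back by 2^35 recovers approximately 0.05.
  printf("iscale=%ld frac_bits=%d approx=%f\n", (long)iscale, frac_bits,
         (double)iscale / (double)(1ll << frac_bits));
  return 0;
}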
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
namespace tflite {
namespace ops {
namespace micro {
// Abstracts access to mli_tensor fields so that different versions of the MLI
// Library (1.x and 2.x) can be used.
// Example:
// ops::micro::MliTensorInterface mli_in =
// ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
// context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
class MliTensorInterface {
public:
// Make sure the lifetime of an MliTensorInterface instance does not exceed
// that of the related mli_tensor.
MliTensorInterface(mli_tensor* tensor) : tensor_(tensor){};
MliTensorInterface() = default;
~MliTensorInterface() = default;
template <typename T>
T* Data();
template <typename T>
T Scale();
template <typename T>
T ZeroPoint();
template <typename T>
T ScaleFracBits();
mli_tensor* MliTensor();
const mli_tensor* MliTensor() const;
int32_t* Dim();
uint32_t* Rank();
uint32_t* Shape();
const uint32_t* Shape() const;
const uint32_t* DataCapacity() const;
uint32_t* ScaleCapacity();
mli_element_type* ElType();
uint32_t* ScaleFracBitsCapacity();
int32_t* MemStride();
uint32_t* ZeroPointCapacity();
template <typename T>
void SetData(T* data, uint32_t capacity) const;
void SetScale(float fscale);
void SetScalePerChannel(float* fscale, const int num_channels);
void SetElType(TfLiteType type);
private:
mli_tensor* tensor_;
};
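// A minimal usage sketch (not part of this change; WrapInt8Example and its
// parameters are placeholders): wrap a persistently allocated mli_tensor and
// fill it through the interface accessors.
inline MliTensorInterface WrapInt8Example(TfLiteContext* context, int8_t* data,
                                          uint32_t data_bytes) {
  MliTensorInterface mli_if(static_cast<mli_tensor*>(
      context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
  mli_if.SetElType(kTfLiteInt8);  // must precede SetData (DCHECK on el_type)
  mli_if.SetData<int8_t>(data, data_bytes);
  *mli_if.Rank() = 1;
  mli_if.Shape()[0] = data_bytes;  // rank-1 tensor of int8 elements
  return mli_if;
}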
} // namespace micro
} // namespace ops
} // namespace tflite
#endif  // TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <math.h>
#include "mli_interface.h" // NOLINT
namespace tflite {
namespace ops {
namespace micro {
#ifdef MLI_2_0
template <>
int8_t* MliTensorInterface::Data(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_8);
return tensor_->data.mem.pi8;
}
template <>
int32_t* MliTensorInterface::Data(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_32);
return tensor_->data.mem.pi32;
}
template <>
int16_t** MliTensorInterface::Scale(void) {
return &tensor_->el_params.sa.scale.mem.pi16;
}
template <>
int16_t* MliTensorInterface::Scale(void) {
return &tensor_->el_params.sa.scale.mem.i16;
}
template <>
void MliTensorInterface::SetData(int8_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_8);
tensor_->data.mem.pi8 = data;
tensor_->data.capacity = capacity;
}
template <>
void MliTensorInterface::SetData(int32_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_32);
tensor_->data.mem.pi32 = data;
tensor_->data.capacity = capacity;
}
mli_tensor* MliTensorInterface::MliTensor(void) { return tensor_; }
const mli_tensor* MliTensorInterface::MliTensor(void) const {
return static_cast<const mli_tensor*>(
const_cast<MliTensorInterface*>(this)->MliTensor());
}
uint32_t* MliTensorInterface::Rank(void) { return &tensor_->rank; }
const uint32_t* MliTensorInterface::DataCapacity(void) const {
return &tensor_->data.capacity;
}
mli_element_type* MliTensorInterface::ElType(void) { return &tensor_->el_type; }
template <>
int16_t* MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.sa.zero_point.mem.i16;
}
template <>
int16_t** MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.sa.zero_point.mem.pi16;
}
uint32_t* MliTensorInterface::ZeroPointCapacity(void) {
return &tensor_->el_params.sa.zero_point.capacity;
}
int32_t* MliTensorInterface::Dim(void) { return &tensor_->el_params.sa.dim; }
uint32_t* MliTensorInterface::ScaleCapacity(void) {
return &tensor_->el_params.sa.scale.capacity;
}
template <>
int8_t** MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.sa.scale_frac_bits.mem.pi8;
}
template <>
int8_t* MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.sa.scale_frac_bits.mem.i8;
}
uint32_t* MliTensorInterface::ScaleFracBitsCapacity(void) {
return &tensor_->el_params.sa.scale_frac_bits.capacity;
}
int32_t* MliTensorInterface::MemStride(void) { return tensor_->mem_stride; }
uint32_t* MliTensorInterface::Shape(void) { return tensor_->shape; }
const uint32_t* MliTensorInterface::Shape(void) const {
return static_cast<const uint32_t*>(
const_cast<MliTensorInterface*>(this)->Shape());
}
void MliTensorInterface::SetScale(float fscale) {
int exp;
frexpf(fscale, &exp);
int frac_bits = 15 - exp;
int16_t iscale = (int16_t)((1ll << frac_bits) * fscale + 0.5f);
*(this->Scale<int16_t*>()) = (int16_t)iscale;
*(this->ScaleFracBits<int8_t*>()) = frac_bits;
*this->ScaleCapacity() = 1 * sizeof(int16_t);
*this->ScaleFracBitsCapacity() = 1 * sizeof(int8_t);
}
void MliTensorInterface::SetScalePerChannel(float* fscale,
const int num_channels) {
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = 15 - exp;
(*this->ScaleFracBits<int8_t**>())[i] = cur_frac_bits;
}
for (int i = 0; i < num_channels; i++) {
int16_t iscale =
(int16_t)((1ll << (*this->ScaleFracBits<int8_t**>())[i]) * fscale[i] +
0.5f);
(*this->Scale<int16_t**>())[i] = iscale;
}
}
void MliTensorInterface::SetElType(TfLiteType type) {
if (type == kTfLiteInt8) {
*this->ElType() = MLI_EL_SA_8;
} else if (type == kTfLiteInt32) {
*this->ElType() = MLI_EL_SA_32;
} else {
TF_LITE_FATAL("Wrong data type. Expected int8_t or int32_t.");
}
}
#endif
} // namespace micro
} // namespace ops
} // namespace tflite
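For contrast with the MLI 1.x path above (one shared Q31 scale using the minimum frac_bits across channels), the MLI 2.0 path keeps a per-channel Q15 scale together with a per-channel frac_bits. A small standalone sketch of that arithmetic (illustrative values only, not part of this change):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  const float fscale[] = {0.05f, 0.4f};
  for (int i = 0; i < 2; i++) {
    int exp;
    frexpf(fscale[i], &exp);
    int frac_bits = 15 - exp;  // per-channel, not the minimum over channels
    int16_t iscale = (int16_t)((1ll << frac_bits) * fscale[i] + 0.5f);
    printf("ch%d: frac_bits=%d iscale=%d\n", i, frac_bits, (int)iscale);
    // ch0: 0.05 = 0.8 * 2^-4 -> frac_bits 19, iscale ~ 0.05 * 2^19 ~ 26214
    // ch1: 0.4  = 0.8 * 2^-1 -> frac_bits 16, iscale ~ 0.4  * 2^16 ~ 26214
  }
  return 0;
}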
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,104 +17,261 @@ limitations under the License.
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_TF_UTILS_H_
#include "mli_api.h" // NOLINT
#include "mli_interface.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
constexpr int kFracBitsQ15 = 15;
constexpr int kFracBitsQ31 = 31;
#define KRNL_C_DIM_NHWC 0 // output channels
namespace tflite {
namespace ops {
namespace micro {
inline void ConvertToMliTensorData(const TfLiteTensor* tfT, mli_tensor* mliT) {
inline void ConvertToMliTensorData(const TfLiteTensor* tfT,
MliTensorInterface* mliT,
bool is_bias_tensor) {
// Data is NULL until MliTensorAttachBuffer is called.
mliT->data = NULL;
mliT->SetElType(tfT->type);
if (tfT->type == kTfLiteInt8) {
mliT->el_type = MLI_EL_ASYM_I8;
mliT->SetData<int8_t>(nullptr, tfT->bytes);
} else if (tfT->type == kTfLiteInt32) {
mliT->el_type = MLI_EL_ASYM_I32;
mliT->SetData<int32_t>(nullptr, tfT->bytes);
} else {
MicroPrintf("Wrong data type. Expected int8_t or int32_t.");
TFLITE_ABORT;
}
const int32_t dims_count = GetTensorShape(tfT).DimensionsCount();
*mliT->Rank() = is_bias_tensor ? 1 : dims_count;
mliT->capacity = tfT->bytes;
mliT->rank = GetTensorShape(tfT).DimensionsCount();
for (int i = 0; i < GetTensorShape(tfT).DimensionsCount(); i++) {
mliT->shape[i] = GetTensorShape(tfT).Dims(i);
if (is_bias_tensor) {
mliT->Shape()[0] = GetTensorShape(tfT).Dims(dims_count - 1);
mliT->MemStride()[0] = 0;
} else {
for (int i = 0; i < dims_count; i++) {
mliT->Shape()[i] = GetTensorShape(tfT).Dims(i);
mliT->MemStride()[i] = 0;
}
}
}
inline void ConvertToMliQuantParams(const TfLiteTensor* tfT, mli_tensor* mliT) {
mliT->el_params.asym.dim = -1;
mliT->el_params.asym.zero_point.i16 = tfT->params.zero_point;
inline void ConvertToMliQuantParams(const TfLiteTensor* tfT,
MliTensorInterface* mliT) {
*mliT->Dim() = -1;
#ifdef MLI_2_0
*mliT->ZeroPointCapacity() = 1 * sizeof(int16_t);
#endif
*mliT->ZeroPoint<int16_t*>() = tfT->params.zero_point;
float fscale = tfT->params.scale;
int exp;
frexpf(fscale, &exp);
int frac_bits = kFracBitsQ31 - exp;
int32_t iscale = (int32_t)((1ll << frac_bits) * fscale + 0.5f);
mliT->el_params.asym.scale_frac_bits = frac_bits;
mliT->el_params.asym.scale.i32 = (int32_t)iscale;
mliT->SetScale(fscale);
}
inline void ConvertToMliQuantParamsPerChannel(const TfLiteTensor* tfT,
mli_tensor* mliT) {
MliTensorInterface* mliT,
bool is_bias_tensor) {
// mli tensor scale and zero_point arrays should be allocated at this point
TFLITE_DCHECK_NE(mliT->el_params.asym.scale.pi16, 0);
TFLITE_DCHECK_NE(mliT->el_params.asym.zero_point.pi16, 0);
#ifdef MLI_2_0
TFLITE_DCHECK_NE(*mliT->Scale<int16_t**>(), 0);
TFLITE_DCHECK_NE(*mliT->ZeroPoint<int16_t**>(), 0);
#else
TFLITE_DCHECK_NE(*mliT->Scale<int32_t**>(), 0);
TFLITE_DCHECK_NE(*mliT->ZeroPoint<int16_t**>(), 0);
#endif
// get per channel quantization parameters
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(tfT->quantization.params);
mliT->el_params.asym.dim = affine_quantization->quantized_dimension;
int32_t quantized_dimension =
is_bias_tensor ? 0 : affine_quantization->quantized_dimension;
const int num_channels = mliT->Shape()[quantized_dimension];
*mliT->Dim() = quantized_dimension;
// find frac_bits
const int num_channels =
mliT->shape[affine_quantization->quantized_dimension];
int min_frac_bits;
// set capacities
#ifdef MLI_2_0
*mliT->ScaleFracBitsCapacity() = num_channels * sizeof(int8_t);
*mliT->ScaleCapacity() = num_channels * sizeof(int16_t);
*mliT->ZeroPointCapacity() = num_channels * sizeof(int16_t);
#endif
float* fscale = affine_quantization->scale->data;
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = kFracBitsQ31 - exp;
if (i == 0) {
min_frac_bits = cur_frac_bits;
} else {
min_frac_bits =
min_frac_bits < cur_frac_bits ? min_frac_bits : cur_frac_bits;
}
}
mliT->el_params.asym.scale_frac_bits = min_frac_bits;
mliT->SetScalePerChannel(fscale, num_channels);
#ifdef MLI_2_0
int16_t* zero_point = *mliT->ZeroPoint<int16_t**>();
for (int i = 0; i < num_channels; i++) {
int32_t iscale = (int32_t)((1ll << min_frac_bits) * fscale[i] + 0.5f);
mliT->el_params.asym.scale.pi32[i] = iscale;
zero_point[i] = tfT->params.zero_point;
}
#endif
}
template <typename datatype>
inline void MliTensorAttachBuffer(const TfLiteEvalTensor* tfT,
mli_tensor* mliT) {
inline void MliTensorAttachBuffer(const TfLiteEvalTensor*,
const MliTensorInterface*);
template <>
inline void MliTensorAttachBuffer<int8_t>(const TfLiteEvalTensor* tfT,
const MliTensorInterface* mliT) {
// "const_cast" here used to attach const data buffer to the initially
// non-const mli_tensor. This is required by current implementation of MLI
// backend and planned for redesign due to this and some other aspects.
mliT->data = const_cast<void*>(
static_cast<const void*>(tflite::micro::GetTensorData<datatype>(tfT)));
mliT->SetData<int8_t>(
const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(tfT)),
*mliT->DataCapacity());
}
inline void ConvertToMliTensor(const TfLiteTensor* tfT, mli_tensor* mliT) {
ConvertToMliTensorData(tfT, mliT);
template <>
inline void MliTensorAttachBuffer<int32_t>(const TfLiteEvalTensor* tfT,
const MliTensorInterface* mliT) {
// "const_cast" here used to attach const data buffer to the initially
// non-const mli_tensor. This is required by current implementation of MLI
// backend and planned for redesign due to this and some other aspects.
mliT->SetData<int32_t>(
const_cast<int32_t*>(tflite::micro::GetTensorData<int32_t>(tfT)),
*mliT->DataCapacity());
}
inline void ConvertToMliTensor(const TfLiteTensor* tfT,
MliTensorInterface* mliT) {
ConvertToMliTensorData(tfT, mliT, false);
ConvertToMliQuantParams(tfT, mliT);
}
inline void ConvertToMliTensorPerChannel(const TfLiteTensor* tfT,
mli_tensor* mliT) {
ConvertToMliTensorData(tfT, mliT);
ConvertToMliQuantParamsPerChannel(tfT, mliT);
MliTensorInterface* mliT,
bool is_bias_tensor) {
ConvertToMliTensorData(tfT, mliT, is_bias_tensor);
ConvertToMliQuantParamsPerChannel(tfT, mliT, is_bias_tensor);
}
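// Illustrative call pattern only (not part of this change; the function and
// tensor names are placeholders): a conv-style kernel converts activations
// with ConvertToMliTensor() and weights/bias with the per-channel variant,
// passing is_bias_tensor accordingly. The per-channel scale / zero-point
// arrays inside mli_weights and mli_bias must already be allocated (see the
// DCHECKs in ConvertToMliQuantParamsPerChannel above).
inline void ConvertConvTensorsExample(const TfLiteTensor* input,
                                      const TfLiteTensor* weights,
                                      const TfLiteTensor* bias,
                                      MliTensorInterface* mli_in,
                                      MliTensorInterface* mli_weights,
                                      MliTensorInterface* mli_bias) {
  ConvertToMliTensor(input, mli_in);
  ConvertToMliTensorPerChannel(weights, mli_weights,
                               /*is_bias_tensor=*/false);
  ConvertToMliTensorPerChannel(bias, mli_bias,
                               /*is_bias_tensor=*/true);
}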
#ifdef MLI_2_0_KRNL_TEST
// Reorder an array according to the given indices. If backward is true, the
// order of the index array must be reversed.
inline static void reorder(uint32_t* arr, const uint8_t index[],
bool backward) {
uint32_t temp[MLI_MAX_RANK];
for (int8_t i = 0; i < MLI_MAX_RANK; i++) {
if (backward)
temp[index[i]] = arr[i];
else
temp[i] = arr[index[i]];
}
for (int8_t i = 0; i < MLI_MAX_RANK; i++) {
arr[i] = temp[i];
}
}
// Change shape of mli tensor and recalculate mem strides.
inline void change_shape(mli_tensor* mliT, const uint8_t dim_order[]) {
reorder(mliT->shape, dim_order, false);
// Calculate strides for new layout
int mli_tensor_memstride = 1;
for (int shape_idx = mliT->rank - 1; shape_idx >= 0; --shape_idx) {
mliT->mem_stride[shape_idx] = mli_tensor_memstride;
mli_tensor_memstride *= mliT->shape[shape_idx];
}
}
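// Illustrative only (not part of this change; ReorderExample is a placeholder
// and MLI_MAX_RANK == 4 is assumed): what reorder() above does for an NHWC
// shape {8, 3, 3, 4} and the dimension order {1, 2, 3, 0}, i.e. HWCN.
inline void ReorderExample() {
  uint32_t shape[] = {8, 3, 3, 4};  // N, H, W, C
  const uint8_t hwcn_order[] = {1, 2, 3, 0};
  reorder(shape, hwcn_order, /*backward=*/false);  // shape is now {3, 3, 4, 8}
  reorder(shape, hwcn_order, /*backward=*/true);   // inverse: {8, 3, 3, 4}
}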
inline void permute_weights(const mli_tensor* weights_src,
const mli_permute_cfg* permute_cfg,
mli_tensor* weights_dst,
mli_data_container* buffer_data) {
mli_tensor buffer = {};
buffer.el_params = weights_dst->el_params;
buffer.data = *buffer_data;
// Compare weights tensor size and available buffer capacity.
int buffer_size = buffer_data->capacity;
int weights_size = mli_hlp_count_elem_num(weights_src, 0) *
mli_hlp_tensor_element_size(weights_src);
if (buffer_size >= weights_size) {
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
mli_mov_tensor_sync(weights_src, &copy_config, &buffer);
mli_krn_permute_sa8(&buffer, permute_cfg, weights_dst);
} else {
// Weights shape is NHWC and output (buffer) shape is HWC, where N_w = C_o.
// The buffer size (H_o * W_o) must be greater than or equal to the weights
// size (H_w * W_w * C_w). This is why the buffer size (output tensor) is
// divided by the channel dimension.
uint32_t slice_size = buffer_size / weights_src->shape[KRNL_C_DIM_NHWC];
mli_mov_cfg_t copy_config = {};
uint32_t src_offsets[] = {0, 0, 0, 0};
uint32_t src_sizes[] = {0, 0, 0, 0};
int dst_mem_stride[] = {0, 0, 0, 0};
// Need to change the shape of the destination weights buffer according to the
// permute dimension order in order to calculate the slice sizes.
change_shape(weights_dst, permute_cfg->perm_dim);
mli_tensor weights_dst_sub_tensor;
mli_sub_tensor_cfg sub_tensor_cfg = {};
sub_tensor_cfg.sub_tensor_rank = weights_src->rank;
// Calculate dimensions for the slice according to the buffer capacity.
// Now, after calling the change_shape() function, the dst weights buffer has
// the MLI layout (HWCN). This means the innermost dimension (N) of the dst
// weights tensor is equal to the innermost dimension (N) of the output tensor.
sub_tensor_cfg.size[weights_dst->rank - 1] =
src_sizes[weights_dst->rank - 1] = weights_src->shape[KRNL_C_DIM_NHWC];
// Now we need to calculate the remaining shapes for the weights slice. The
// total slice size is H*W*C*N, so to calculate the size for each axis, the
// available slice size is divided by the shape of each axis.
uint32_t slice_size_left = slice_size;
for (uint32_t i = 0; i < weights_dst->rank - 1; i++) {
sub_tensor_cfg.size[i] = src_sizes[i] =
slice_size_left / weights_dst->shape[i] > 0 ? weights_dst->shape[i]
: slice_size_left;
slice_size_left /= weights_dst->shape[i];
slice_size_left = slice_size_left > 0 ? slice_size_left : 1;
}
// Reorder the src tensor sizes because the src tensor is still in the TFLM
// format (NHWC) while the src_sizes array was calculated as (HWCN).
reorder(src_sizes, permute_cfg->perm_dim, true);
sub_tensor_cfg.offset[KRNL_C_DIM_HWCN] = src_offsets[KRNL_H_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_H_DIM_HWCN] = src_offsets[KRNL_W_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_W_DIM_HWCN] = src_offsets[KRNL_D_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_D_DIM_HWCN] = src_offsets[KRNL_C_DIM_HWCN] = 0;
do {
do {
do {
do {
mli_mov_cfg_for_slice(&copy_config, (int*)src_offsets,
(int*)src_sizes, dst_mem_stride);
mli_mov_tensor_sync(weights_src, &copy_config, &buffer);
mli_hlp_create_subtensor(weights_dst, &sub_tensor_cfg,
&weights_dst_sub_tensor);
mli_krn_permute_sa8(&buffer, permute_cfg, &weights_dst_sub_tensor);
// For each axis, it is necessary to recalculate the offsets and
// slice sizes.
sub_tensor_cfg.offset[2] = src_offsets[3] += src_sizes[3];
src_sizes[3] =
std::min(src_sizes[3], weights_src->shape[3] - src_offsets[3]);
} while (src_offsets[3] < weights_src->shape[3]);
sub_tensor_cfg.offset[1] = src_offsets[2] += src_sizes[2];
src_sizes[2] =
std::min(src_sizes[2], weights_src->shape[2] - src_offsets[2]);
} while (src_offsets[2] < weights_src->shape[2]);
sub_tensor_cfg.offset[0] = src_offsets[1] += src_sizes[1];
src_sizes[1] =
std::min(src_sizes[1], weights_src->shape[1] - src_offsets[1]);
} while (src_offsets[1] < weights_src->shape[1]);
sub_tensor_cfg.offset[3] = src_offsets[0] += src_sizes[0];
src_sizes[0] =
std::min(src_sizes[0], weights_src->shape[0] - src_offsets[0]);
} while (src_offsets[0] < weights_src->shape[0]);
}
}
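// Hypothetical setup for permute_weights() above (not from this change; the
// function name and scratch parameters are placeholders, and the
// mli_permute_cfg / mli_data_container field names mirror the usage in
// permute_weights() itself): an NHWC -> HWCN permutation with a scratch
// buffer wrapped in an mli_data_container.
inline void PermuteWeightsExample(const mli_tensor* weights_src,
                                  mli_tensor* weights_dst, void* scratch_ptr,
                                  uint32_t scratch_bytes) {
  mli_permute_cfg permute_cfg = {};
  permute_cfg.perm_dim[0] = 1;  // H
  permute_cfg.perm_dim[1] = 2;  // W
  permute_cfg.perm_dim[2] = 3;  // C
  permute_cfg.perm_dim[3] = 0;  // N  (NHWC -> HWCN)
  mli_data_container buffer_data = {};
  buffer_data.capacity = scratch_bytes;
  buffer_data.mem.pi8 = static_cast<int8_t*>(scratch_ptr);
  permute_weights(weights_src, &permute_cfg, weights_dst, &buffer_data);
}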
#endif
} // namespace micro
} // namespace ops
} // namespace tflite
......
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -25,7 +25,7 @@ namespace tflite {
namespace ops {
namespace micro {
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
int* grant_size_1, int* grant_size_2) {
int maxrequest = 0;
......@@ -66,202 +66,215 @@ static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
}
static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* out) {
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out) {
int request_size_in = 0;
int request_size_out = 0;
int grant_size_in = 0;
int grant_size_out = 0;
if (!inside_arc_ccm(in->data)) {
if (!inside_arc_ccm(in->Data<int8_t>())) {
// In case the input tensor contains multiple batches, it has rank 4. Because
// the mli kernel cannot operate on batches, we need the size of a single HWC
// tensor; that is why the start_rank is 1 in case of an input of rank 4.
int start_rank = in->rank - 3;
request_size_in = mli_hlp_count_elem_num(in, start_rank) *
mli_hlp_tensor_element_size(in);
int start_rank = *in->Rank() - 3;
request_size_in = mli_hlp_count_elem_num(in->MliTensor(), start_rank) *
mli_hlp_tensor_element_size(in->MliTensor());
}
if (!inside_arc_ccm(out->data)) {
if (!inside_arc_ccm(out->Data<int8_t>())) {
// In case the output tensor contains multiple batches, it has rank 4. Because
// the mli kernel cannot operate on batches, we need the size of a single
// batch; that is why the start_rank is 1 in case of an output of rank 4.
int start_rank = out->rank - 3;
request_size_out = mli_hlp_count_elem_num(out, start_rank) *
mli_hlp_tensor_element_size(out);
int start_rank = *out->Rank() - 3;
request_size_out = mli_hlp_count_elem_num(out->MliTensor(), start_rank) *
mli_hlp_tensor_element_size(out->MliTensor());
}
get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in,
&grant_size_out);
if (!inside_arc_ccm(in->data)) {
in->data = get_arc_scratch_buffer(grant_size_in);
in->capacity = grant_size_in;
if (in->data == NULL) return kTfLiteError;
if (!inside_arc_ccm(in->Data<int8_t>())) {
in->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(grant_size_in)),
grant_size_in);
if (in->Data<int8_t>() == NULL) return kTfLiteError;
}
if (!inside_arc_ccm(out->data)) {
out->data = get_arc_scratch_buffer(grant_size_out);
out->capacity = grant_size_out;
if (out->data == NULL) return kTfLiteError;
if (!inside_arc_ccm(out->Data<int8_t>())) {
out->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(grant_size_out)),
grant_size_out);
if (out->Data<int8_t>() == NULL) return kTfLiteError;
}
return kTfLiteOk;
}
#endif
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out) {
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
if (!inside_arc_ccm(bias->Data<int32_t>())) {
uint32_t bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
mli_hlp_count_elem_num(bias->MliTensor(), 0) *
mli_hlp_tensor_element_size(bias->MliTensor());
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(bias_mem_requirements)),
bias_mem_requirements);
}
if (bias->data == NULL) {
if (bias->Data<int32_t>() == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(max_bias_size)),
max_bias_size);
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
if (bias->Data<int32_t>() == NULL) ret_val = kTfLiteError;
if (!inside_arc_ccm(weights->Data<int8_t>())) {
int weights_size = mli_hlp_count_elem_num(weights->MliTensor(), 0) *
mli_hlp_tensor_element_size(weights->MliTensor());
int max_weights_size = 0;
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(weights_size)),
weights_size);
if (weights->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_weights_size)),
max_weights_size);
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
}
#endif
return ret_val;
}
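// Illustrative call order only (not part of this change; PlanConvSlicing and
// the conv parameters are placeholders): a conv-style kernel first tries to
// place all tensors in fast CCM memory, then asks how large the I/O and
// weight slices may be before entering its mli_mov-based slicing loops.
static TfLiteStatus PlanConvSlicing(TfLiteContext* context,
                                    MliTensorInterface* in,
                                    MliTensorInterface* weights,
                                    MliTensorInterface* bias,
                                    MliTensorInterface* out, int kernel_height,
                                    int stride_height, int padding_top,
                                    int padding_bot) {
  if (get_arc_scratch_buffer_for_conv_tensors(context, in, weights, bias,
                                              out) != kTfLiteOk) {
    return kTfLiteError;
  }
  int in_slice_height = 0;
  int out_slice_height = 0;
  int slice_channels = 0;
  if (arc_scratch_buffer_calc_slice_size_io(
          in, out, kernel_height, stride_height, padding_top, padding_bot,
          &in_slice_height, &out_slice_height) != kTfLiteOk) {
    return kTfLiteError;
  }
  // 0 is the output-channel dimension of NHWC weights (KRNL_C_DIM_NHWC).
  if (arc_scratch_buffer_calc_slice_size_weights(
          weights, bias, /*weight_out_ch_dimension=*/0, &slice_channels) !=
      kTfLiteOk) {
    return kTfLiteError;
  }
  return kTfLiteOk;
}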
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out) {
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
/* Strategy for FC kernels:
First allocate the input, because it cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights & bias,
because if fully loaded they can be reused over batches. Then the output.
The number of output channels (for weights slicing) depends on the size of
the output and the size of weights & bias. */
if (!inside_arc_ccm(in->data)) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int size_in = mli_hlp_count_elem_num(in, in->rank - 1) *
mli_hlp_tensor_element_size(in);
in->data = get_arc_scratch_buffer(size_in);
in->capacity = size_in;
if (in->data == NULL) {
in->capacity = 0;
ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->Data<int32_t>())) {
int bias_mem_requirements = mli_hlp_count_elem_num(bias->MliTensor(), 0) *
mli_hlp_tensor_element_size(bias->MliTensor());
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(bias_mem_requirements)),
bias_mem_requirements);
}
if (bias->Data<int32_t>() == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(max_bias_size)),
max_bias_size);
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->Data<int32_t>() == NULL) ret_val = kTfLiteError;
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
if (!inside_arc_ccm(weights->Data<int8_t>())) {
int weights_size = mli_hlp_count_elem_num(weights->MliTensor(), 0) *
mli_hlp_tensor_element_size(weights->MliTensor());
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(weights_size)),
weights_size);
if (weights->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_weights_size)),
max_weights_size);
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
if (weights->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
int bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
/* Strategy for FC kernels:
First allocate the input, because it cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights &
bias, because if fully loaded they can be reused over batches. Then the
output. The number of output channels (for weights slicing) depends on
the size of the output and the size of weights & bias. */
if (!inside_arc_ccm(out->data)) {
if (!inside_arc_ccm(in->Data<int8_t>())) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out, out->rank - 1) *
mli_hlp_tensor_element_size(out);
int size_in = mli_hlp_count_elem_num(in->MliTensor(), *in->Rank() - 1) *
mli_hlp_tensor_element_size(in->MliTensor());
in->SetData<int8_t>(static_cast<int8_t*>(get_arc_scratch_buffer(size_in)),
size_in);
if (in->Data<int8_t>() == NULL) {
in->SetData<int8_t>(nullptr, 0);
ret_val = kTfLiteError;
}
}
if (!inside_arc_ccm(out->Data<int8_t>())) {
/* In case the output tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out->MliTensor(), *out->Rank() - 1) *
mli_hlp_tensor_element_size(out->MliTensor());
int max_out_size = 0;
out->data = get_arc_scratch_buffer(out_size);
out->capacity = out_size;
if (out->data == NULL) {
out->SetData<int8_t>(static_cast<int8_t*>(get_arc_scratch_buffer(out_size)),
out_size);
if (out->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_out_size);
out->data = get_arc_scratch_buffer(max_out_size);
out->capacity = max_out_size;
out->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_out_size)),
max_out_size);
if (max_out_size == 0) ret_val = kTfLiteError;
}
if (out->data == NULL) ret_val = kTfLiteError;
if (out->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernel_height,
const int stride_height, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height) {
const MliTensorInterface* in, const MliTensorInterface* out,
const int kernel_height, const int stride_height, const int padding_top,
const int padding_bot, int* in_slice_height, int* out_slice_height) {
const int height_dimension = 1;
const int in_height = in->shape[height_dimension];
const int out_height = out->shape[height_dimension];
const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) *
mli_hlp_tensor_element_size(in);
const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) *
mli_hlp_tensor_element_size(out);
const int in_height = in->Shape()[height_dimension];
const int out_height = out->Shape()[height_dimension];
const int line_size_in =
mli_hlp_count_elem_num(in->MliTensor(), height_dimension + 1) *
mli_hlp_tensor_element_size(in->MliTensor());
const int line_size_out =
mli_hlp_count_elem_num(out->MliTensor(), height_dimension + 1) *
mli_hlp_tensor_element_size(out->MliTensor());
int max_lines_in = 0;
int max_lines_out = 0;
int max_out_lines_for_input = 0;
bool fit = (static_cast<int>(in->capacity) >= in_height * line_size_in) &&
(static_cast<int>(out->capacity) >= out_height * line_size_out);
bool fit =
(static_cast<int>(*in->DataCapacity()) >= in_height * line_size_in) &&
(static_cast<int>(*out->DataCapacity()) >= out_height * line_size_out);
if (fit) {
// In case both tensors completely fit in the capacity, there is no need for
// slicing. As padding can affect the effective input region, we also derive it
// from the output height and rely on clipping logic that intends to reduce the
// last, smaller slice. I.e. the only slice is a kind of
// "smaller last slice that needs to be corrected".
// In case both tensors completely fit in the capacity, there is no need
// for slicing. As padding can affect the effective input region, we also
// derive it from the output height and rely on clipping logic that intends
// to reduce the last, smaller slice. I.e. the only slice is a kind of "smaller
// last slice that needs to be corrected".
*in_slice_height = std::max(in_height, out_height * stride_height);
*out_slice_height = out_height;
} else {
// First compute how many lines fit into the input tensor, and compute how
// many output lines can be computed with that.
max_lines_in =
std::min(in_height, static_cast<int>(in->capacity) / line_size_in);
max_lines_in = std::min(
in_height, static_cast<int>(*in->DataCapacity()) / line_size_in);
if (max_lines_in >= in_height) {
max_out_lines_for_input = out_height;
} else if (2 * max_lines_in >= in_height) {
......@@ -276,8 +289,8 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
(max_lines_in - kernel_height + 1) / stride_height;
}
// Then compute how many output lines fit into the output tensor.
max_lines_out =
std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
max_lines_out = std::min(
out_height, static_cast<int>(*out->DataCapacity()) / line_size_out);
// The smaller of the two determines the slice height for the output, and
// the derived slice height for the input.
*out_slice_height = std::min(max_out_lines_for_input, max_lines_out);
......@@ -292,29 +305,32 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const MliTensorInterface* weights, const MliTensorInterface* bias,
const int weight_out_ch_dimension, int* slice_channels) {
const int channels = weights->shape[weight_out_ch_dimension];
const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) *
mli_hlp_tensor_element_size(weights);
const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) *
mli_hlp_tensor_element_size(bias);
const int channels = weights->Shape()[weight_out_ch_dimension];
const int ch_size_w =
(mli_hlp_count_elem_num(weights->MliTensor(), 0) / channels) *
mli_hlp_tensor_element_size(weights->MliTensor());
const int ch_size_b =
(mli_hlp_count_elem_num(bias->MliTensor(), 0) / channels) *
mli_hlp_tensor_element_size(bias->MliTensor());
int max_ch_weigths = 0;
int max_ch_bias = 0;
bool fit = (static_cast<int>(weights->capacity) >= channels * ch_size_w) &&
(static_cast<int>(bias->capacity) >= channels * ch_size_b);
bool fit =
(static_cast<int>(*weights->DataCapacity()) >= channels * ch_size_w) &&
(static_cast<int>(*bias->DataCapacity()) >= channels * ch_size_b);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
// in case both tensors completely fit in the capacity, there is no need
// for slicing
*slice_channels = channels;
} else {
// First compute how many channels fit into the weights tensor
max_ch_weigths =
std::min(channels, static_cast<int>(weights->capacity) / ch_size_w);
max_ch_weigths = std::min(
channels, static_cast<int>(*weights->DataCapacity()) / ch_size_w);
// Then compute how many channels fit into the bias tensor.
max_ch_bias =
std::min(channels, static_cast<int>(bias->capacity) / ch_size_b);
std::min(channels, static_cast<int>(*bias->DataCapacity()) / ch_size_b);
// The smaller of the two determines the slice size.
*slice_channels = std::min(max_ch_weigths, max_ch_bias);
}
......@@ -326,10 +342,9 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
}
}
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out) {
#ifdef __Xxy
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out) {
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
return get_arc_scratch_buffer_for_io_tensors(context, in, out);
#else
......
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#include "mli_api.h" // NOLINT
#include "mli_interface.h"
#include "tensorflow/lite/c/common.h"
namespace tflite {
......@@ -37,11 +38,9 @@ namespace micro {
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out);
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out);
/**
* @brief Function to allocate scratch buffers for pooling kernels with only
......@@ -56,9 +55,8 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out);
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out);
/**
* @brief Function to allocate scratch buffers for the fully connect tensors
......@@ -75,8 +73,8 @@ TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out);
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out);
/**
* @brief Function to calculate slice size for io tensors
......@@ -99,9 +97,9 @@ TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernelHeight,
const int strideHeight, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height);
const MliTensorInterface* in, const MliTensorInterface* out,
const int kernelHeight, const int strideHeight, const int padding_top,
const int padding_bot, int* in_slice_height, int* out_slice_height);
/**
* @brief Function to calculate slice size for weight slicing
......@@ -119,7 +117,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const MliTensorInterface* weights, const MliTensorInterface* bias,
const int weight_out_ch_dimension, int* slice_channels);
} // namespace micro
......
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -25,31 +25,45 @@ namespace micro {
* used for the data section and the stack. The values can be overridden by
* adding a -D option to the makefile of the application.
*/
#ifdef __Xxy
#ifndef SCRATCH_MEM_X_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_X_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_X_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Y_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_Y_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_Y_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Z_SIZE
#ifdef core_config_dccm_size
#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2)
#else
#define SCRATCH_MEM_Z_SIZE (0)
#endif
#endif
#elif defined(__Xvdsp)
#ifndef SCRATCH_MEM_VEC_SIZE
#ifdef core_config_vec_mem_size
#define SCRATCH_MEM_VEC_SIZE ((core_config_vec_mem_size * 3) / 4)
#endif
#endif
#else
#define SCRATCH_MEM_SIZE (65536)
#endif
namespace {
#ifdef __Xxy
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE];
#pragma Bss()
......@@ -61,12 +75,43 @@ static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE];
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE];
#pragma Bss()
#elif defined(__Xvdsp)
#pragma Bss(".vecmem_data")
static int8_t scratch_mem_vec_1[SCRATCH_MEM_VEC_SIZE / 4];
static int8_t scratch_mem_vec_2[SCRATCH_MEM_VEC_SIZE / 4];
static int8_t scratch_mem_vec_3[SCRATCH_MEM_VEC_SIZE / 2];
#pragma Bss()
#else
static int8_t scratch_mem_stack[SCRATCH_MEM_SIZE];
#endif
} // namespace
#ifdef __Xxy
static int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE,
SCRATCH_MEM_Z_SIZE};
#elif defined(__Xvdsp)
static int8_t* scratch_mem[] = {scratch_mem_vec_1, scratch_mem_vec_2,
scratch_mem_vec_3};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_VEC_SIZE / 4,
SCRATCH_MEM_VEC_SIZE / 4,
SCRATCH_MEM_VEC_SIZE / 2};
#else
static int8_t* scratch_mem[] = {scratch_mem_stack};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_SIZE};
#endif
void* get_arc_scratch_buffer(int size) {
// Function to assign fast memory from one of 3 scratch buffers.
// Best Fit strategy - memory is allocated from that memory bank that leaves
......@@ -85,7 +130,7 @@ void* get_arc_scratch_buffer(int size) {
}
}
if (best_mem_idx >= 0) {
buf = static_cast<void*>(scratch_mem[best_mem_idx]);
buf = scratch_mem[best_mem_idx];
scratch_mem[best_mem_idx] += size;
scratch_sizes[best_mem_idx] -= size;
}
......@@ -122,12 +167,24 @@ void get_arc_scratch_buffer_two_max_sizes(int* size1, int* size2) {
}
void init_arc_scratch_buffers(void) {
#ifdef __Xxy
scratch_mem[0] = scratch_mem_x;
scratch_mem[1] = scratch_mem_y;
scratch_mem[2] = scratch_mem_z;
scratch_sizes[0] = SCRATCH_MEM_X_SIZE;
scratch_sizes[1] = SCRATCH_MEM_Y_SIZE;
scratch_sizes[2] = SCRATCH_MEM_Z_SIZE;
#elif defined(__Xvdsp)
scratch_mem[0] = scratch_mem_vec_1;
scratch_mem[1] = scratch_mem_vec_2;
scratch_mem[2] = scratch_mem_vec_3;
scratch_sizes[0] = SCRATCH_MEM_VEC_SIZE / 4;
scratch_sizes[1] = SCRATCH_MEM_VEC_SIZE / 4;
scratch_sizes[2] = SCRATCH_MEM_VEC_SIZE / 2;
#else
scratch_mem[0] = scratch_mem_stack;
scratch_sizes[0] = SCRATCH_MEM_SIZE;
#endif
}
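// Illustrative only (not part of this change; AllocateExample and the request
// size are placeholders): how the helpers above are typically used together.
// init_arc_scratch_buffers() resets the per-bank pointers and sizes, and each
// get_arc_scratch_buffer() call carves a chunk out of the bank that leaves the
// least unused space (best fit), returning NULL when no bank can satisfy the
// request.
void* AllocateExample() {
  init_arc_scratch_buffers();
  void* buf = get_arc_scratch_buffer(/*size=*/1024);
  if (buf == NULL) {
    // Fall back to the largest chunk that is still available, mirroring the
    // weights/bias fallback logic in scratch_buf_mgr.cc above.
    int max_size = 0;
    get_arc_scratch_buffer_max_size(&max_size);
    buf = get_arc_scratch_buffer(max_size);
  }
  return buf;
}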
} // namespace micro
......
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -49,7 +49,7 @@ static inline bool inside_arc_xccm(void* p) {
}
static inline bool inside_arc_yccm(void* p) {
#if core_config_xy
#if core_config_xy_size
return ((unsigned)p >= core_config_xy_y_base) &&
((unsigned)p < core_config_xy_y_base + core_config_xy_size);
#else
......@@ -57,8 +57,18 @@ static inline bool inside_arc_yccm(void* p) {
#endif
}
static inline bool inside_arc_vccm(void* p) {
#if core_config_vec_mem_size
return ((unsigned)p >= core_config_vec_mem_base) &&
((unsigned)p < core_config_vec_mem_base + core_config_vec_mem_size);
#else
return false;
#endif
}
static inline bool inside_arc_ccm(void* p) {
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p);
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p) ||
inside_arc_vccm(p);
}
} // namespace micro
......