diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 3f74f3e7fffe6209c01bb50f084bfc64bd6a7687..178c2b62fb9ed4456f4cd78c440925f96b1360a1 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -127,6 +127,210 @@ inline float GetTensorScale(const TfLiteTensor* tensor) {
   return tensor == nullptr ? 1.0f : tensor->params.scale;
 }
 
+// Calculates the output state tensor of an LSTM step.
+//
+// Implements the following formula:
+//   output_no_projection = output_gate .* activate(cell_state)
+//     (elementwise vector product)
+// If no projection is used:
+//   output = output_state = output_no_projection
+// With projection:
+//   output = output_state = clip(W*output_no_projection + bias)
+//
+// The output tensor may have a batch stride different from n_output, so the
+// result is written to the contiguous output_state and copied to the output
+// by the caller.
+//
+// Parameters:
+//  - n_batch: the number of distinct vectors in each array (batch size).
+//  - n_cell, n_output: sizes of vectors.
+//  - cell_state, output_gate: input vectors, size n_batch*n_cell.
+//  - projection_weights, projection_bias:
+//      constant inputs, describing projection matrix and bias.
+//  - proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+//  - scratch: scratch area, size n_batch*n_cell.
+// LINT.IfChange
+void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output,
+                              const float* cell_state, const float* output_gate,
+                              TfLiteFusedActivation activation,
+                              const float* projection_weights,
+                              const float* projection_bias,
+                              const float proj_clip, float* output_state,
+                              float* scratch) {
+  tensor_utils::ApplyActivationToVector(cell_state, n_batch * n_cell,
+                                        activation, scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate, scratch, n_batch * n_cell,
+                                         scratch);
+
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_projection_bias = (projection_bias != nullptr);
+
+  if (use_projection) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias, n_output, n_batch,
+                                            output_state);
+    } else {
+      std::fill_n(output_state, n_batch * n_output, 0.0f);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        projection_weights, n_output, n_cell, scratch, n_batch, output_state);
+    if (proj_clip > 0.0f) {
+      tensor_utils::CwiseClipping(output_state, n_batch * n_output, proj_clip);
+    }
+  } else {
+    std::copy_n(scratch, n_batch * n_output, output_state);
+  }
+}
+// LINT.ThenChange(//tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc)
+
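As a reading aid, the formula in the comment above can be written as a small self-contained routine. This is a sketch, not the kernel: the hypothetical LstmOutputFloatReference below uses plain loops and std::tanh where the real code dispatches to optimized tensor_utils kernels, and it fixes the activation to tanh.

#include <algorithm>
#include <cmath>
#include <vector>

// Hypothetical reference version of the formula above:
// output_state = clip(W * (output_gate .* tanh(cell_state)) + bias)
std::vector<float> LstmOutputFloatReference(
    int n_batch, int n_cell, int n_output,
    const std::vector<float>& cell_state,    // n_batch * n_cell
    const std::vector<float>& output_gate,   // n_batch * n_cell
    const std::vector<float>& proj_weights,  // n_output * n_cell; empty: none
    const std::vector<float>& proj_bias,     // n_output; empty: none
    float proj_clip) {
  std::vector<float> hidden(n_batch * n_cell);
  for (int i = 0; i < n_batch * n_cell; ++i) {
    hidden[i] = output_gate[i] * std::tanh(cell_state[i]);
  }
  if (proj_weights.empty()) {
    // Without projection the kernel requires n_output == n_cell, so the
    // elementwise product is already the output state.
    return hidden;
  }
  std::vector<float> output_state(n_batch * n_output);
  for (int b = 0; b < n_batch; ++b) {
    for (int o = 0; o < n_output; ++o) {
      float acc = proj_bias.empty() ? 0.0f : proj_bias[o];
      for (int c = 0; c < n_cell; ++c) {
        acc += proj_weights[o * n_cell + c] * hidden[b * n_cell + c];
      }
      output_state[b * n_output + o] =
          proj_clip > 0.0f ? std::clamp(acc, -proj_clip, proj_clip) : acc;
    }
  }
  return output_state;
}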
+// Calculates the output state tensor of an LSTM step. See the float version
+// above for the formula.
+//
+// Parameters:
+//  - n_batch: the number of distinct vectors in each array (batch size).
+//  - n_cell, n_output: sizes of vectors.
+//  - cell_state, output_gate: input vectors, size n_batch*n_cell.
+//  - projection_weights, projection_weights_scale, projection_bias:
+//      constant inputs, describing projection matrix and bias.
+//  - proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+//  - asymmetric_quantize_inputs: parameter to control quantization.
+//  - projection_weights_row_sums, compute_row_sums, context: data for the
+//      optimized MatrixBatchVectorMultiplyAccumulate.
+//  - scratch0: scratch area of size n_batch*n_cell
+//  - scratch1: scratch area of size n_batch*n_cell
+//  - scratch2: scratch area of size n_batch
+//  - scratch3: scratch area of size n_batch
+//  - scratch4: scratch area used by MatrixBatchVectorMultiplyAccumulate
+void CalculateLstmOutputHybrid(
+    int n_batch, int n_cell, int n_output, const float* cell_state,
+    const float* output_gate, TfLiteFusedActivation activation,
+    const int8_t* projection_weights, float projection_weights_scale,
+    const float* projection_bias, const float proj_clip, float* output_state,
+    bool asymmetric_quantize_inputs, int32_t* projection_weights_row_sums,
+    bool* compute_row_sums, CpuBackendContext* context, float* scratch0,
+    int8_t* scratch1, float* scratch2, int32_t* scratch3, int32_t* scratch4) {
+  tensor_utils::ApplyActivationToVector(cell_state, n_batch * n_cell,
+                                        activation, scratch0);
+  tensor_utils::VectorVectorCwiseProduct(output_gate, scratch0,
+                                         n_batch * n_cell, scratch0);
+
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_projection_bias = (projection_bias != nullptr);
+
+  if (use_projection) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias, n_output, n_batch,
+                                            output_state);
+    } else {
+      std::fill_n(output_state, n_batch * n_output, 0.0f);
+    }
+    if (!tensor_utils::IsZeroVector(scratch0, n_batch * n_cell)) {
+      // Skip quantization and matmul for an all-zero projection input.
+      tensor_utils::BatchQuantizeFloats(scratch0, n_batch, n_cell, scratch1,
+                                        scratch2, scratch3,
+                                        asymmetric_quantize_inputs);
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights, n_output, n_cell, scratch1,
+          projection_weights_scale, scratch2, n_batch, output_state,
+          /*per_channel_scale=*/nullptr,
+          asymmetric_quantize_inputs ? scratch3 : nullptr, scratch4,
+          projection_weights_row_sums, compute_row_sums, scratch2, context);
+    }
+    if (proj_clip > 0.0f) {
+      tensor_utils::CwiseClipping(output_state, n_batch * n_output, proj_clip);
+    }
+  } else {
+    std::copy_n(scratch0, n_batch * n_output, output_state);
+  }
+}
+
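The hybrid path quantizes the float hidden vector on the fly and runs the projection matmul in int8. The sketch below shows the idea under simplifying assumptions (hypothetical HybridProjectionSketch: symmetric per-row quantization only, no zero points, row sums, or accumulator scratch; output_state is assumed pre-filled with the bias or zeros, matching the accumulate semantics above).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void HybridProjectionSketch(int n_batch, int n_cell, int n_output,
                            const float* hidden,         // n_batch * n_cell
                            const int8_t* proj_weights,  // n_output * n_cell
                            float proj_weights_scale, float* output_state) {
  std::vector<int8_t> quantized(n_cell);
  for (int b = 0; b < n_batch; ++b) {
    // Per-row symmetric quantization: the max magnitude maps to 127.
    float max_abs = 0.0f;
    for (int c = 0; c < n_cell; ++c) {
      max_abs = std::max(max_abs, std::abs(hidden[b * n_cell + c]));
    }
    const float scaling_factor = max_abs / 127.0f;
    const float inv = scaling_factor > 0.0f ? 1.0f / scaling_factor : 0.0f;
    for (int c = 0; c < n_cell; ++c) {
      quantized[c] =
          static_cast<int8_t>(std::round(hidden[b * n_cell + c] * inv));
    }
    // Integer dot products, rescaled by both quantization scales.
    for (int o = 0; o < n_output; ++o) {
      int32_t acc = 0;
      for (int c = 0; c < n_cell; ++c) {
        acc += proj_weights[o * n_cell + c] * quantized[c];
      }
      output_state[b * n_output + o] +=
          acc * scaling_factor * proj_weights_scale;
    }
  }
}

The IsZeroVector guard in the real code pays off because a fully saturated-off output gate is common in practice; the sketch omits it for brevity.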
+// Calculates the output state tensor of an LSTM step. See the float and
+// hybrid versions as well.
+//
+// Parameters:
+//  - n_batch: the number of distinct vectors in each array (batch size).
+//  - n_cell, n_output: sizes of vectors.
+//  - cell_state, output_gate: input vectors, size n_batch*n_cell.
+//  - cell_state_scale: scaling of cell_state.
+//  - effective_hidden_scale_[a|b]: effective scale of cell_state.*output_gate
+//  - hidden_zp: zero_point for cell_state.*output_gate
+//  - projection_weights, effective_proj_scale_[a|b], projection_effective_bias:
+//      constant inputs, describing projection matrix and bias.
+//  - output_state_zp: zero point of output_state. (Input, calibrated value.)
+//  - quantized_proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+//  - context: data for the optimized MatrixBatchVectorMultiplyAccumulate.
+//  - scratch0: scratch area of size n_batch*n_cell
+//  - scratch1: scratch area of size n_batch*n_cell
+//  - scratch2: scratch area used by MatrixBatchVectorMultiplyAccumulate
+void CalculateLstmOutputInteger8x8_16(
+    int n_batch, int n_cell, int n_output, const int16_t* cell_state,
+    int32_t cell_state_scale, const int16_t* output_gate,
+    int32_t effective_hidden_scale_a, int32_t effective_hidden_scale_b,
+    int32_t hidden_zp, const int8_t* projection_weights,
+    int32_t effective_proj_scale_a, int32_t effective_proj_scale_b,
+    const int32_t* projection_effective_bias, int32_t output_state_zp,
+    int8_t quantized_proj_clip, int8_t* output_state,
+    CpuBackendContext* context, int16_t* scratch0, int8_t* scratch1,
+    int32_t* scratch2) {
+  // Note: unlike float/hybrid, the activation is always Tanh.
+  tensor_utils::ApplyTanh(15 + cell_state_scale, cell_state, n_batch, n_cell,
+                          scratch0);
+  tensor_utils::CwiseMul(output_gate, scratch0, effective_hidden_scale_a,
+                         effective_hidden_scale_b, n_batch, n_cell, hidden_zp,
+                         scratch1);
+
+  const bool use_projection = (projection_weights != nullptr);
+
+  if (use_projection) {
+    // Note: unlike float/hybrid, there is no separate bias assignment; the
+    // effective bias is added inside the matmul below.
+    std::fill_n(output_state, n_batch * n_output, 0);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        scratch1, projection_effective_bias, projection_weights,
+        effective_proj_scale_a, effective_proj_scale_b, n_batch, n_cell,
+        n_output, output_state_zp, scratch2, output_state, context);
+    if (quantized_proj_clip > 0) {
+      tensor_utils::CwiseClipping(output_state, n_batch * n_output,
+                                  quantized_proj_clip);
+    }
+  } else {
+    std::copy_n(scratch1, n_batch * n_output, output_state);
+  }
+}
+
+// Calculates the output state tensor of an LSTM step. See the float and
+// hybrid versions as well.
+//
+// Parameters:
+//  - n_batch: the number of distinct vectors in each array (batch size).
+//  - n_cell, n_output: sizes of vectors.
+//  - cell_state, output_gate: input vectors, size n_batch*n_cell.
+//  - projection_weights, effective_proj_scale_[a|b], projection_bias:
+//      constant inputs, describing projection matrix and bias.
+//  - output_state_zp: zero point of the output state.
+//  - quantized_proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+//  - scratch: scratch area of size n_batch*n_cell
+void CalculateLstmOutputInteger8x8_8(
+    int n_batch, int n_cell, int n_output, const int16_t* cell_state,
+    const int16_t* output_gate, const int8_t* projection_weights,
+    int32_t effective_proj_scale_a, int32_t effective_proj_scale_b,
+    const int32_t* projection_bias, int32_t output_state_zp,
+    int32_t quantized_proj_clip, int8_t* output_state, int16_t* scratch) {
+  // Note: unlike float/hybrid, the activation is always Tanh.
+  tensor_utils::ApplyTanhFloat(cell_state, n_batch, n_cell, -15, scratch);
+  tensor_utils::CwiseMul(output_gate, scratch, n_batch, n_cell, 15 + 15 - 15,
+                         scratch);
+  // Note: unlike float/hybrid, the bias is applied inside the matmul below.
+  tensor_utils::MatrixBatchVectorMultiply(
+      scratch, projection_weights, effective_proj_scale_a,
+      effective_proj_scale_b, projection_bias, n_batch, n_cell, n_output,
+      output_state_zp, output_state);
+  if (quantized_proj_clip > 0) {
+    tensor_utils::CwiseClipping(output_state, n_batch * n_output,
+                                quantized_proj_clip);
+  }
+}
+
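In the 8x8_16 path, tanh(cell_state) and output_gate are both Q0.15 int16 vectors, and their product has to be rescaled into the int8 hidden domain. CwiseMul's (effective_hidden_scale_a, effective_hidden_scale_b) pair encodes a fixed-point multiplier; the sketch below illustrates the same requantization with a plain double multiplier instead (hypothetical CwiseMulToInt8Sketch, not the library routine).

#include <algorithm>
#include <cmath>
#include <cstdint>

void CwiseMulToInt8Sketch(const int16_t* a, const int16_t* b, int n,
                          double effective_scale, int32_t zero_point,
                          int8_t* out) {
  for (int i = 0; i < n; ++i) {
    const int64_t prod = static_cast<int64_t>(a[i]) * b[i];  // Q0.30 product
    int32_t value =
        static_cast<int32_t>(std::lround(prod * effective_scale)) + zero_point;
    value = std::min(127, std::max(-128, value));
    out[i] = static_cast<int8_t>(value);
  }
}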
 // Performs an LSTM batch inference step for input specified by input_ptr.
 // The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
 // biases (*_bias_ptr), and buffers (*_scratch), along with additional
@@ -395,32 +599,12 @@ inline void LstmStepFloat(
   }
   tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                                      output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_gate_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr,
+                           output_gate_scratch, params->activation,
+                           projection_weights_ptr, projection_bias_ptr,
+                           params->proj_clip, output_state_ptr, scratch2);
 
-  // For each batch: update output_state.
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_state_ptr);
-    } else {
-      std::fill_n(output_state_ptr, n_batch * n_output, 0.0f);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_state_ptr);
-    if (params->proj_clip > 0.0) {
-      tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output,
-                                  params->proj_clip);
-    }
-  } else {
-    std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr);
-  }
   // Copy output_state to the output. Note that the output batch rows may not be
   // contiguous (output_batch_leading_dim != n_output).
   for (int b = 0; b < n_batch; b++) {
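The trailing loop exists because the caller's output tensor can have a batch stride (output_batch_leading_dim) larger than n_output, while output_state is contiguous. A toy illustration with hypothetical buffers:

#include <algorithm>
#include <vector>

int main() {
  const int n_batch = 2, n_output = 3, output_batch_leading_dim = 5;
  // Contiguous output_state: row b starts at b * n_output.
  std::vector<float> output_state = {1, 2, 3, 4, 5, 6};
  // Strided output: row b starts at b * output_batch_leading_dim; the two
  // trailing slots of each row belong to other data and stay untouched.
  std::vector<float> output(n_batch * output_batch_leading_dim, 0.0f);
  for (int b = 0; b < n_batch; b++) {
    std::copy_n(output_state.data() + b * n_output, n_output,
                output.data() + b * output_batch_leading_dim);
  }
  // output is now {1, 2, 3, 0, 0, 4, 5, 6, 0, 0}.
}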
@@ -861,44 +1045,17 @@ inline void LstmStepHybrid(
   }
   tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                                      output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_gate_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  CalculateLstmOutputHybrid(
+      n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+      params->activation, projection_weights_ptr, projection_weights_scale,
+      projection_bias_ptr, params->proj_clip, output_state_ptr,
+      asymmetric_quantize_inputs, projection_weights_row_sums, compute_row_sums,
+      context, scratch2, quantized_output_scratch, scaling_factors, zero_points,
+      accum_scratch_ptr);
 
-  // For each batch: update the projection and output_state. Note that since
-  // the output batch rows may not be contiguous (output_batch_leading_dim !=
-  // n_output), we unroll the batched operations.
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_state_ptr);
-    } else {
-      std::fill_n(output_state_ptr, n_batch * n_output, 0.0f);
-    }
-    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
-      // Save quantization and matmul computation for all zero input.
-      tensor_utils::BatchQuantizeFloats(
-          output_gate_scratch, n_batch, n_cell, quantized_output_scratch,
-          scaling_factors, zero_points, asymmetric_quantize_inputs);
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          projection_weights_ptr, n_output, n_cell, quantized_output_scratch,
-          projection_weights_scale, scaling_factors, n_batch, output_state_ptr,
-          /*per_channel_scale=*/nullptr,
-          asymmetric_quantize_inputs ? zero_points : nullptr, accum_scratch_ptr,
-          projection_weights_row_sums, compute_row_sums,
-          scaling_factors_scratch, context);
-    }
-    if (params->proj_clip > 0.0) {
-      tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output,
-                                  params->proj_clip);
-    }
-  } else {
-    std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr);
-  }
+  // Copy output_state_ptr to the output. Note that the output batch rows may
+  // not be contiguous (output_batch_leading_dim != n_output).
   for (int b = 0; b < n_batch; b++) {
     std::copy_n(output_state_ptr + b * n_output, n_output,
                 output_ptr + b * output_batch_leading_dim);
@@ -1071,7 +1228,6 @@ inline void LstmStepInteger8x8_16(
   const bool use_cifg = (input_to_input_weight_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weight_ptr != nullptr);
   const bool use_layer_norm = (layer_norm_forget_weight_ptr != nullptr);
-  const bool use_projection = (projection_weight_ptr != nullptr);
 
   // Check for nullptrs.
   TFLITE_DCHECK(input_to_forget_effective_bias);
@@ -1219,28 +1375,17 @@ inline void LstmStepInteger8x8_16(
   tensor_utils::ApplySigmoid(output_gate_scratch, n_batch, n_cell,
                              output_gate_scratch);
 
-  // Hidden.
-  tensor_utils::ApplyTanh(15 + cell_state_scale, cell_state_ptr, n_batch,
-                          n_cell, input_gate_scratch);
-
-  tensor_utils::CwiseMul(output_gate_scratch, input_gate_scratch,
-                         effective_hidden_scale_a, effective_hidden_scale_b,
-                         n_batch, n_cell, hidden_zp, scratch4);
-  // Projection.
-  if (use_projection) {
-    std::fill_n(output_ptr, n_batch * n_output, 0);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        scratch4, projection_effective_bias, projection_weight_ptr,
-        effective_proj_scale_a, effective_proj_scale_b, n_batch, n_cell,
-        n_output, output_state_zp, scratch5, output_ptr, context);
-    if (quantized_proj_clip > 0) {
-      tensor_utils::CwiseClipping(output_ptr, n_batch * n_output,
-                                  quantized_proj_clip);
-    }
-  } else {
-    std::copy_n(scratch4, n_batch * n_output, output_ptr);
-  }
-  std::copy_n(output_ptr, n_batch * n_output, output_state_ptr);
+  CalculateLstmOutputInteger8x8_16(
+      n_batch, n_cell, n_output, cell_state_ptr, cell_state_scale,
+      output_gate_scratch, effective_hidden_scale_a, effective_hidden_scale_b,
+      hidden_zp, projection_weight_ptr, effective_proj_scale_a,
+      effective_proj_scale_b, projection_effective_bias, output_state_zp,
+      quantized_proj_clip, output_state_ptr, context, scratch0, scratch4,
+      scratch5);
+
+  // Copy output state to the output. Note that unlike float or hybrid, the
+  // output here is always contiguous.
+  std::copy_n(output_state_ptr, n_batch * n_output, output_ptr);
 }
 
 // Fully quantized lstm kernel for 8 bit gate matmul output.
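quantized_proj_clip is the float proj_clip expressed in the output state's quantized units (presumably proj_clip divided by the output scale; how it is derived is outside this diff). Clipping then reduces to a plain integer clamp, as in this minimal sketch (hypothetical CwiseClippingInt8Sketch, not the library routine):

#include <algorithm>
#include <cstdint>

void CwiseClippingInt8Sketch(int8_t* v, int n, int8_t clip) {
  for (int i = 0; i < n; ++i) {
    v[i] = std::min<int8_t>(clip, std::max<int8_t>(-clip, v[i]));
  }
}

With clip == 0 the kernels skip clipping entirely, which is why the guards above test quantized_proj_clip > 0.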
@@ -1502,27 +1647,15 @@ inline void LstmStepInteger8x8_8(
                                 quantized_cell_clip);
   }
 
-  // Cell to hidden.
-  tensor_utils::ApplyTanhFloat(cell_state_ptr, n_batch, n_cell, -15,
-                               forget_gate_scratch);
-
-  tensor_utils::CwiseMul(output_gate_scratch, forget_gate_scratch, n_batch,
-                         n_cell, 15 + 15 - 15, cell_gate_scratch);
-
-  // Projection.
-  tensor_utils::MatrixBatchVectorMultiply(
-      cell_gate_scratch, projection_weight_ptr, effective_proj_scale_a,
-      effective_proj_scale_b, projection_bias_ptr, n_batch, n_cell, n_output,
-      output_state_zp, output_ptr);
-
-  // Projection clipping.
-  if (quantized_proj_clip > 0) {
-    tensor_utils::CwiseClipping(output_ptr, n_batch * n_output,
-                                quantized_proj_clip);
-  }
+  CalculateLstmOutputInteger8x8_8(
+      n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+      projection_weight_ptr, effective_proj_scale_a, effective_proj_scale_b,
+      projection_bias_ptr, output_state_zp, quantized_proj_clip,
+      output_state_ptr, scratch2);
 
-  // Copy output to output state.
-  std::copy_n(output_ptr, n_batch * n_output, output_state_ptr);
+  // Copy output state to the output. Note that unlike float or hybrid, the
+  // output here is always contiguous.
+  std::copy_n(output_state_ptr, n_batch * n_output, output_ptr);
 }
 
 }  // namespace
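The 8x8_8 path computes tanh through a float detour (ApplyTanhFloat) rather than the fixed-point ApplyTanh used by 8x8_16. A sketch of what that plausibly amounts to, assuming the int16 input carries scale 2^integer_bits (-15 above, i.e. Q0.15) and the output is requantized to Q0.15 (hypothetical ApplyTanhFloatSketch, not the library routine):

#include <algorithm>
#include <cmath>
#include <cstdint>

void ApplyTanhFloatSketch(const int16_t* input, int n, int integer_bits,
                          int16_t* output) {
  const float in_scale = std::pow(2.0f, static_cast<float>(integer_bits));
  for (int i = 0; i < n; ++i) {
    // Dequantize, apply tanh in float, requantize to Q0.15 with saturation.
    const long q = std::lround(std::tanh(input[i] * in_scale) * 32768.0f);
    output[i] = static_cast<int16_t>(std::min(32767L, std::max(-32768L, q)));
  }
}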
diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
index ed1ef07d8d3c9a4350aaee513e8fd50779c23cbf..6399af013c5e40332a6ba12e139a84cc9c24aa09 100644
--- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
+++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc
@@ -37,6 +37,41 @@ namespace builtin {
 
 namespace {
 
+void CalculateLstmOutputFloat(
+    int n_batch, int n_cell, int n_output, const float* cell_state,
+    const float* output_gate, TfLiteFusedActivation activation,
+    const float* projection_weights, const float* projection_bias,
+    const float proj_clip, float* output_state, float* scratch, Logger* logger,
+    const std::vector<int>& intermediate_tensor_indexes,
+    ErrorReporter* error_reporter) {
+  tensor_utils::ApplyActivationToVector(cell_state, n_batch * n_cell,
+                                        activation, scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate, scratch, n_batch * n_cell,
+                                         scratch);
+
+  logger->LogTensorValue(intermediate_tensor_indexes[4], scratch,
+                         n_cell * n_batch, error_reporter);
+
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_projection_bias = (projection_bias != nullptr);
+
+  if (use_projection) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias, n_output, n_batch,
+                                            output_state);
+    } else {
+      std::fill_n(output_state, n_batch * n_output, 0.0f);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        projection_weights, n_output, n_cell, scratch, n_batch, output_state);
+    if (proj_clip > 0.0f) {
+      tensor_utils::CwiseClipping(output_state, n_batch * n_output, proj_clip);
+    }
+  } else {
+    std::copy_n(scratch, n_batch * n_output, output_state);
+  }
+}
+
 inline void LstmStepWithAuxInput(
     const float* input_ptr, const float* input_to_input_weights_ptr,
     const float* input_to_forget_weights_ptr,
@@ -245,35 +280,13 @@ inline void LstmStepWithAuxInput(
   }
   tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                                      output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_gate_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_gate_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-  logger->LogTensorValue(intermediate_tensor_indexes[4], output_gate_scratch,
-                         n_cell * n_batch, error_reporter);
-
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr,
+                           output_gate_scratch, params->activation,
+                           projection_weights_ptr, projection_bias_ptr,
+                           params->proj_clip, output_state_ptr, scratch2,
+                           logger, intermediate_tensor_indexes, error_reporter);
 
-  // For each batch: update output_state.
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_state_ptr);
-    } else {
-      std::fill_n(output_state_ptr, n_batch * n_output, 0.0f);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_state_ptr);
-    if (params->proj_clip > 0.0) {
-      tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output,
-                                  params->proj_clip);
-    }
-  } else {
-    std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr);
-  }
   // Copy output_state to the output. Note that the output batch rows may not be
   // contiguous (output_batch_leading_dim != n_output).
   for (int b = 0; b < n_batch; b++) {
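The calibration variant differs from the kernel only by the LogTensorValue call on the pre-projection hidden vector; min/max calibration needs nothing more than the running range of each observed tensor. A hypothetical stand-in logger illustrates the contract (MinMaxLoggerSketch; the real Logger and ErrorReporter interfaces are richer):

#include <algorithm>
#include <limits>
#include <map>
#include <utility>

class MinMaxLoggerSketch {
 public:
  void LogTensorValue(int index, const float* values, int size) {
    // Fold the observed buffer into the running {min, max} for this index.
    auto& range =
        ranges_
            .try_emplace(index, std::numeric_limits<float>::max(),
                         std::numeric_limits<float>::lowest())
            .first->second;
    for (int i = 0; i < size; ++i) {
      range.first = std::min(range.first, values[i]);
      range.second = std::max(range.second, values[i]);
    }
  }

 private:
  std::map<int, std::pair<float, float>> ranges_;  // index -> {min, max}
};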