From 5072f1dab5c1d6ead6cb749d23d24fad2bef89e0 Mon Sep 17 00:00:00 2001 From: Adrian Lundell Date: Tue, 19 Mar 2024 09:33:09 +0100 Subject: [PATCH] Add support for int16 unidirectional lstm Change-Id: I1ca5cf4f0778a119bf0fd2fa6e1daadf16d53e83 --- ARM.CMSIS-NN.pdsc | 6 + Include/arm_nn_types.h | 14 +- Include/arm_nnfunctions.h | 87 ++- Include/arm_nnsupportfunctions.h | 109 ++- .../arm_elementwise_mul_s16_batch_offset.c | 166 +++++ Source/FullyConnectedFunctions/CMakeLists.txt | 6 +- .../arm_vector_sum_s8_s64.c | 156 +++++ Source/LSTMFunctions/CMakeLists.txt | 3 +- .../arm_lstm_unidirectional_s16.c | 95 +++ .../arm_nn_lstm_calculate_gate_s16.c | 96 +++ .../NNSupportFunctions/arm_nn_lstm_step_s16.c | 112 ++++ .../arm_nn_vec_mat_mul_result_acc_s16.c | 362 ++++++++++ Tests/UnitTest/CMakeLists.txt | 1 + .../{test_arm_lstm_s16.json => lstm_s16.json} | 2 +- .../JsonTemplates/lstm_s16_tm.json | 620 ++++++++++++++++++ .../UnitTest/RefactoredTestGen/Lib/op_lstm.py | 14 +- .../RefactoredTestGen/Lib/op_utils.py | 2 +- Tests/UnitTest/RefactoredTestGen/Lib/test.py | 35 +- .../RefactoredTestGen/Lib/test_plan.py | 21 +- .../RefactoredTestGen/generate_test_data.py | 7 +- .../UnitTest/RefactoredTestGen/test_plan.json | 6 +- .../TestData/lstm_1_s16/cell_gate_bias.h | 7 + .../lstm_1_s16/cell_gate_hidden_weights.h | 12 + .../lstm_1_s16/cell_gate_input_weights.h | 19 + .../TestData/lstm_1_s16/config_data.h | 34 + .../TestData/lstm_1_s16/forget_gate_bias.h | 6 + .../lstm_1_s16/forget_gate_hidden_weights.h | 12 + .../lstm_1_s16/forget_gate_input_weights.h | 19 + .../TestCases/TestData/lstm_1_s16/input.h | 22 + .../TestData/lstm_1_s16/input_gate_bias.h | 6 + .../lstm_1_s16/input_gate_hidden_weights.h | 12 + .../lstm_1_s16/input_gate_input_weights.h | 19 + .../TestCases/TestData/lstm_1_s16/output.h | 13 + .../TestData/lstm_1_s16/output_gate_bias.h | 7 + .../lstm_1_s16/output_gate_hidden_weights.h | 12 + .../lstm_1_s16/output_gate_input_weights.h | 18 + 
.../TestCases/TestData/lstm_1_s16/test_data.h | 15 + .../TestData/lstm_2_s16/cell_gate_bias.h | 6 + .../lstm_2_s16/cell_gate_hidden_weights.h | 9 + .../lstm_2_s16/cell_gate_input_weights.h | 8 + .../TestData/lstm_2_s16/config_data.h | 34 + .../TestData/lstm_2_s16/forget_gate_bias.h | 6 + .../lstm_2_s16/forget_gate_hidden_weights.h | 9 + .../lstm_2_s16/forget_gate_input_weights.h | 8 + .../TestCases/TestData/lstm_2_s16/input.h | 10 + .../TestData/lstm_2_s16/input_gate_bias.h | 6 + .../lstm_2_s16/input_gate_hidden_weights.h | 9 + .../lstm_2_s16/input_gate_input_weights.h | 8 + .../TestCases/TestData/lstm_2_s16/output.h | 10 + .../TestData/lstm_2_s16/output_gate_bias.h | 6 + .../lstm_2_s16/output_gate_hidden_weights.h | 9 + .../lstm_2_s16/output_gate_input_weights.h | 8 + .../TestCases/TestData/lstm_2_s16/test_data.h | 15 + .../lstm_one_time_step_s16/cell_gate_bias.h | 6 + .../cell_gate_hidden_weights.h | 6 + .../cell_gate_input_weights.h | 9 + .../lstm_one_time_step_s16/config_data.h | 34 + .../lstm_one_time_step_s16/forget_gate_bias.h | 6 + .../forget_gate_hidden_weights.h | 6 + .../forget_gate_input_weights.h | 9 + .../TestData/lstm_one_time_step_s16/input.h | 11 + .../lstm_one_time_step_s16/input_gate_bias.h | 6 + .../input_gate_hidden_weights.h | 6 + .../input_gate_input_weights.h | 9 + .../TestData/lstm_one_time_step_s16/output.h | 6 + .../lstm_one_time_step_s16/output_gate_bias.h | 6 + .../output_gate_hidden_weights.h | 6 + .../output_gate_input_weights.h | 9 + .../lstm_one_time_step_s16/test_data.h | 15 + .../CMakeLists.txt | 23 + .../unity_test_arm_lstm_unidirectional_s16.c | 48 ++ .../test_arm_lstm_unidirectional_s16.c | 475 ++++++++++++++ 72 files changed, 2945 insertions(+), 65 deletions(-) create mode 100644 Source/BasicMathFunctions/arm_elementwise_mul_s16_batch_offset.c create mode 100644 Source/FullyConnectedFunctions/arm_vector_sum_s8_s64.c create mode 100644 Source/LSTMFunctions/arm_lstm_unidirectional_s16.c create mode 100644 
Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s16.c create mode 100644 Source/NNSupportFunctions/arm_nn_lstm_step_s16.c create mode 100644 Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c rename Tests/UnitTest/RefactoredTestGen/JsonTemplates/{test_arm_lstm_s16.json => lstm_s16.json} (99%) create mode 100644 Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16_tm.json create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/config_data.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/input.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/output.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_1_s16/test_data.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_hidden_weights.h create mode 100644 
Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/config_data.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/input.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/output.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_2_s16/test_data.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/config_data.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input.h create mode 100644 
Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_bias.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_hidden_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_input_weights.h create mode 100644 Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/test_data.h create mode 100644 Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/CMakeLists.txt create mode 100644 Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/Unity/unity_test_arm_lstm_unidirectional_s16.c create mode 100644 Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/test_arm_lstm_unidirectional_s16.c diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc index 5dbf5495..c8246ea4 100644 --- a/ARM.CMSIS-NN.pdsc +++ b/ARM.CMSIS-NN.pdsc @@ -84,6 +84,7 @@ + @@ -110,16 +111,21 @@ + + + + + diff --git a/Include/arm_nn_types.h b/Include/arm_nn_types.h index c567f0c1..affc1d5f 100644 --- a/Include/arm_nn_types.h +++ b/Include/arm_nn_types.h @@ -22,8 +22,8 @@ * Description: Public header file to contain the CMSIS-NN structs for the * TensorFlowLite micro compliant functions * - * $Date: 19 January 2024 - * $Revision: V.3.0.0 + * $Date: 26 March 2024 + * $Revision: V.3.1.0 * * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ @@ -191,15 +191,15 @@ typedef struct { int32_t input_multiplier; int32_t input_shift; - const int8_t *input_weights; - const int32_t *input_effective_bias; /**< Bias added with precomputed kernel_sum * lhs_offset*/ + const 
void *input_weights; + const void *input_effective_bias; /**< Bias added with precomputed kernel_sum * lhs_offset*/ int32_t hidden_multiplier; int32_t hidden_shift; - const int8_t *hidden_weights; - const int32_t *hidden_effective_bias; /**< Precomputed kernel_sum * lhs_offset*/ + const void *hidden_weights; + const void *hidden_effective_bias; /**< Precomputed kernel_sum * lhs_offset*/ - const int32_t *bias; + const void *bias; arm_nn_activation_type activation_type; } cmsis_nn_lstm_gate; diff --git a/Include/arm_nnfunctions.h b/Include/arm_nnfunctions.h index e4604c17..b381a7fe 100644 --- a/Include/arm_nnfunctions.h +++ b/Include/arm_nnfunctions.h @@ -21,8 +21,8 @@ * Title: arm_nnfunctions.h * Description: Public header file for CMSIS NN Library * - * $Date: 11 March 2024 - * $Revision: V.15.0.0 + * $Date: 20 February 2024 + * $Revision: V.15.1.0 * * Target : Arm(R) M-Profile Architecture @@ -1475,7 +1475,7 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, int8_t *output_data); /** - * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add bias_data. + * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data. * @param[in, out] vector_sum_buf Buffer for vector sums * @param[in] vector_cols Number of vector columns * @param[in] vector_rows Number of vector rows @@ -1492,6 +1492,24 @@ arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, const int32_t lhs_offset, const int32_t *bias_data); +/** + * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s64 bias_data. + * @param[in, out] vector_sum_buf Buffer for vector sums + * @param[in] vector_cols Number of vector columns + * @param[in] vector_rows Number of vector rows + * @param[in] vector_data Vector of weigths data + * @param[in] lhs_offset Constant multiplied with each sum + * @param[in] bias_data Vector of bias data, added to each sum. 
+ * @return The function returns + * ARM_CMSIS_NN_SUCCESS - Successful operation + */ +arm_cmsis_nn_status arm_vector_sum_s8_s64(int64_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data, + const int32_t lhs_offset, + const int64_t *bias_data); + /** * @brief Get size of additional buffer required by arm_fully_connected_s8(). * See also arm_vector_sum_s8, which is required if buffer size is > 0. @@ -2401,13 +2419,41 @@ arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, const cmsis_nn_dims *output_dims, int8_t *output_data); +/** + * @brief Get size of additional buffer required by arm_svdf_s8(). + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); + /** * @defgroup LSTM LSTM Layer Functions * */ /** - * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output. + * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output, 32 bit bias. 
* * @param[in] input Pointer to input data * @param[out] output Pointer to output data @@ -2428,32 +2474,25 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s8(const int8_t *input, cmsis_nn_lstm_context *buffers); /** - * @brief Get size of additional buffer required by arm_svdf_s8(). - * @param[in] filter_dims dimension of filter - * @return The function returns required buffer size in bytes + * @brief LSTM unidirectional function with 16 bit input and output and 16 bit gate output, 64 bit bias. * - */ -int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); - -/** - * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension. - * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * @param[in] input Pointer to input data + * @param[out] output Pointer to output data + * @param[in] params Struct containing all information about the lstm operator, see arm_nn_types. + * @param[in] buffers Struct containing pointers to all temporary scratch buffers needed for the + * lstm operator, see arm_nn_types. * - * @note Intended for compilation on Host. If compiling for an Arm target, use - * arm_svdf_s8_get_buffer_size(). * - */ -int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); - -/** - * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case. - * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * @return The function returns ARM_CMSIS_NN_SUCCESS * - * @note Intended for compilation on Host. If compiling for an Arm target, use - * arm_svdf_s8_get_buffer_size(). + * @details + * 1. 
Supported framework: TensorFlow Lite Micro * */ -int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); +arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input, + int16_t *output, + const cmsis_nn_lstm_params *params, + cmsis_nn_lstm_context *buffers); #ifdef __cplusplus } diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h index 70c57022..26d096e9 100644 --- a/Include/arm_nnsupportfunctions.h +++ b/Include/arm_nnsupportfunctions.h @@ -21,8 +21,8 @@ * Title: arm_nnsupportfunctions.h * Description: Public header file of support functions for CMSIS NN Library * - * $Date: 22 March 2024 - * $Revision: V.20.0.0 + * $Date: 14 February 2024 + * $Revision: V.20.1.0 * * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ @@ -1538,9 +1538,9 @@ __STATIC_FORCEINLINE void arm_nn_write_s8x2_ia(int8_t **dst, int16_t src) // Support functions for LSTM /** - * @brief Update LSTM function for an iteration step + * @brief Update LSTM function for an iteration step using s8 input and output, and s16 internally. * - * @param[in] data_in Data input pointervoid + * @param[in] data_in Data input pointer * @param[in] hidden_in Hidden state/ recurrent input pointer * @param[out] hidden_out Hidden state/ recurrent output pointer * @param[in] params Struct containg all information about the lstm operator, see @@ -1561,6 +1561,30 @@ arm_cmsis_nn_status arm_nn_lstm_step_s8(const int8_t *data_in, cmsis_nn_lstm_context *buffers, const int32_t batch_offset); +/** + * @brief Update LSTM function for an iteration step using s16 input and output, and s16 internally. + * + * @param[in] data_in Data input pointer + * @param[in] hidden_in Hidden state/ recurrent input pointer + * @param[out] hidden_out Hidden state/ recurrent output pointer + * @param[in] params Struct containg all information about the lstm operator, see + * arm_nn_types. 
+ * @param[in] buffers Struct containg pointers to all temporary scratch buffers needed for the + * lstm operator, see arm_nn_types. + * @param[in] batch_offset Number of timesteps between consecutive batches. + * E.g for params->timing_major = true, all batches for t=0 are stored sequentially, so batch offset = 1. + * For params->time major = false, all time steps are stored continously before the next batch, so + * batch offset = params->time_steps. + * @return The function returns ARM_CMSIS_NN_SUCCESS + + */ +arm_cmsis_nn_status arm_nn_lstm_step_s16(const int16_t *data_in, + const int16_t *hidden_in, + int16_t *hidden_out, + const cmsis_nn_lstm_params *params, + cmsis_nn_lstm_context *buffers, + const int32_t batch_offset); + /** * @brief Updates a LSTM gate for an iteration step of LSTM function, int8x8_16 version. * @@ -1582,6 +1606,27 @@ arm_cmsis_nn_status arm_nn_lstm_calculate_gate_s8_s16(const int8_t *data_in, int16_t *output, const int32_t batch_offset); +/** + * @brief Updates a LSTM gate for an iteration step of LSTM function, int16x8_16 version. + * + * @param[in] data_in Data input pointer + * @param[in] hidden_in Hidden state/ recurrent input pointer + * @param[in] gate_data Struct containing all information about the gate caluclation, see + * arm_nn_types. + * @param[in] params Struct containing all information about the lstm_operation, see + * arm_nn_types + * @param[out] output Hidden state/ recurrent output pointer + * @param[in] batch_offset Number of timesteps between consecutive batches, see + * arm_nn_lstm_step_s16. + * @return The function returns ARM_CMSIS_NN_SUCCESS + */ +arm_cmsis_nn_status arm_nn_lstm_calculate_gate_s16(const int16_t *data_in, + const int16_t *hidden_in, + const cmsis_nn_lstm_gate *gate_data, + const cmsis_nn_lstm_params *params, + int16_t *output, + const int32_t batch_offset); + /** * @brief The result of the multiplication is accumulated to the passed result buffer. * Multiplies a matrix by a "batched" vector (i.e. 
a matrix with a batch dimension composed by input vectors independent @@ -1612,6 +1657,36 @@ arm_cmsis_nn_status arm_nn_vec_mat_mul_result_acc_s8_s16(const int8_t *lhs, const int32_t batches, const int32_t batch_offset); +/** + * @brief The result of the multiplication is accumulated to the passed result buffer. + * Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed by input vectors independent + * from each other). + * + * @param[in] lhs Batched vector + * @param[in] rhs Weights - input matrix (H(Rows)xW(Columns)) + * @param[in] effective_bias Bias + lhs_offset * kernel_sum term precalculated into a constant vector. + * @param[out] dst Output + * @param[in] dst_multiplier Multiplier for quantization + * @param[in] dst_shift Shift for quantization + * @param[in] rhs_cols Vector/matarix column length + * @param[in] rhs_rows Row count of matrix + * @param[in] batches Batch size + * @param[in] batch_offset Number of timesteps between consecutive batches in input, see arm_nn_lstm_step_s16. + Note that the output is always stored with sequential batches. 
+ * @return The function returns ARM_CMSIS_NN_SUCCESS + + */ +arm_cmsis_nn_status arm_nn_vec_mat_mul_result_acc_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *effective_bias, + int16_t *dst, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t batches, + const int32_t batch_offset); + /** * @brief s16 elementwise multiplication with s8 output * @param[in] input_1_vect pointer to input vector 1 @@ -1638,6 +1713,32 @@ arm_cmsis_nn_status arm_elementwise_mul_s16_s8(const int16_t *input_1_vect, const int32_t batch_size, const int32_t batch_offset); +/** + * @brief s16 elementwise multiplication with s16 output + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] block_size number of samples per batch + * @param[in] batch_size number of samples per batch + * @param[in] batch_offset Number of timesteps between consecutive batches in output, see + * arm_nn_lstm_step_s16. Note that it is assumed that the input is stored with sequential batches. + * @return The function returns ARM_CMSIS_NN_SUCCESS + * + * @details Supported framework: TensorFlow Lite micro + */ +arm_cmsis_nn_status arm_elementwise_mul_s16_batch_offset(const int16_t *input_1_vect, + const int16_t *input_2_vect, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t block_size, + const int32_t batch_size, + const int32_t batch_offset); + /** * @brief s16 elementwise multiplication. The result of the multiplication is accumulated to the passed result buffer. 
* @param[in] input_1_vect pointer to input vector 1 diff --git a/Source/BasicMathFunctions/arm_elementwise_mul_s16_batch_offset.c b/Source/BasicMathFunctions/arm_elementwise_mul_s16_batch_offset.c new file mode 100644 index 00000000..b3acc3eb --- /dev/null +++ b/Source/BasicMathFunctions/arm_elementwise_mul_s16_batch_offset.c @@ -0,0 +1,166 @@ +/* + * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s16_batch_offset + * Description: Element wise multiplication + * + * $Date: 18 March 2024 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup groupElementwise + * @{ + */ + +/** + * @brief s16 element wise multiplication of batches of two vectors + * + * @note Refer header file for details. 
+ * + */ +arm_cmsis_nn_status arm_elementwise_mul_s16_batch_offset(const int16_t *input_1_vect, + const int16_t *input_2_vect, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t block_size, + const int32_t batch_size, + const int32_t batch_offset) +{ + + int32_t loop_count; + + for (int i = 0; i < batch_size; i++) + { + +#if defined(ARM_MATH_MVEI) + + const int16_t *input_1_ptr = input_1_vect; + const int16_t *input_2_ptr = input_2_vect; + int16_t *output_ptr = output; + + loop_count = block_size; + + while (loop_count > 0) + { + mve_pred16_t pred = vctp32q(loop_count); + + int32x4_t input_1 = vldrhq_z_s32(input_1_ptr, pred); + int32x4_t input_2 = vldrhq_z_s32(input_2_ptr, pred); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + res_0 = vaddq_n_s32(res_0, out_offset); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(NN_Q15_MIN)); + res_0 = vminq_s32(res_0, vdupq_n_s32(NN_Q15_MAX)); + + vstrhq_p_s32(output_ptr, res_0, pred); + input_1_ptr += 4; + input_2_ptr += 4; + + output_ptr += 4; + loop_count -= 4; + } + + input_1_vect += block_size; + input_2_vect += block_size; + output += block_size; + +#else + int32_t input_1; + int32_t input_2; + int32_t mul_res; + int32_t two_halfword_1, two_halfword_2; + int16_t mul_1, mul_2; + loop_count = block_size / 2; + + while (loop_count > 0) + { + two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect); + two_halfword_2 = arm_nn_read_q15x2_ia(&input_2_vect); + + #if defined(ARM_MATH_DSP) + mul_res = SMULBB(two_halfword_1, two_halfword_2); + #else + input_1 = (int16_t)(two_halfword_1 & 0xFFFF); + input_2 = (int16_t)(two_halfword_2 & 0xFFFF); + mul_res = input_1 * input_2; + #endif + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = MAX(mul_res, NN_Q15_MIN); + mul_res = MIN(mul_res, NN_Q15_MAX); + mul_1 = (int16_t)mul_res; + + #if defined(ARM_MATH_DSP) 
+ mul_res = SMULTT(two_halfword_1, two_halfword_2); + #else + input_1 = (int16_t)(two_halfword_1 >> 16); + input_2 = (int16_t)(two_halfword_2 >> 16); + mul_res = input_1 * input_2; + #endif + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = MAX(mul_res, NN_Q15_MIN); + mul_res = MIN(mul_res, NN_Q15_MAX); + mul_2 = (int16_t)mul_res; + + arm_nn_write_q15x2_ia(&output, PACK_Q15x2_32x1(mul_1, mul_2)); + + loop_count--; + } + + if (block_size & 0x1) + { + /* C = A * B */ + + input_1 = *input_1_vect++; + input_2 = *input_2_vect++; + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + + mul_res = MAX(mul_res, NN_Q15_MIN); + mul_res = MIN(mul_res, NN_Q15_MAX); + + *output++ = (int16_t)mul_res; + } +#endif // #if defined(ARM_MATH_MVEI) + + output += (batch_offset - 1) * block_size; + } + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of Doxygen group + */ diff --git a/Source/FullyConnectedFunctions/CMakeLists.txt b/Source/FullyConnectedFunctions/CMakeLists.txt index e2cfbf04..1d254c87 100644 --- a/Source/FullyConnectedFunctions/CMakeLists.txt +++ b/Source/FullyConnectedFunctions/CMakeLists.txt @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright 2019-2021, 2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2019-2021, 2023-2024 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -19,4 +19,6 @@ file(GLOB SRC_S4 "./*_s4.c") file(GLOB SRC_S8 "./*_s8.c") file(GLOB SRC_S16 "./*_s16*.c") -target_sources(cmsis-nn PRIVATE ${SRC_S4} ${SRC_S8} ${SRC_S16}) +file(GLOB SRC_S64 "./*_s64.c") + +target_sources(cmsis-nn PRIVATE ${SRC_S4} ${SRC_S8} ${SRC_S16} ${SRC_S64}) diff --git a/Source/FullyConnectedFunctions/arm_vector_sum_s8_s64.c b/Source/FullyConnectedFunctions/arm_vector_sum_s8_s64.c new file mode 100644 index 00000000..f5fc824e --- /dev/null +++ b/Source/FullyConnectedFunctions/arm_vector_sum_s8_s64.c @@ -0,0 +1,156 @@ +/* + * 
SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_vector_sum_s8_s64 + * Description: Generic function for calculating vector sums + * + * $Date: 26 March 2024 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" +/** + * @ingroup Public + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S8 vector sum fuction in preparation for e.g. kernel sums in fully connected and matrix multiplication layer function + * + * Refer header file for details. 
+ * + */ +arm_cmsis_nn_status arm_vector_sum_s8_s64(int64_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data, + const int32_t lhs_offset, + const int64_t *bias_data) +{ + + if (bias_data) + { + memcpy(vector_sum_buf, bias_data, vector_rows * sizeof(int64_t)); + } + else + { + memset(vector_sum_buf, 0, vector_rows * sizeof(int64_t)); + } + if (lhs_offset) + { +#if defined(ARM_MATH_MVEI) + + const int32_t row_loop_cnt = vector_rows / 5; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + const int8_t *vector_0 = vector_data; + const int8_t *vector_1 = vector_data + vector_cols; + const int8_t *vector_2 = vector_data + 2 * vector_cols; + const int8_t *vector_3 = vector_data + 3 * vector_cols; + const int8_t *vector_4 = vector_data + 4 * vector_cols; + int32_t vector_sum_0 = 0; + int32_t vector_sum_1 = 0; + int32_t vector_sum_2 = 0; + int32_t vector_sum_3 = 0; + int32_t vector_sum_4 = 0; + uint32_t col_cnt = (uint32_t)vector_cols; + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + const int8x16_t ker_1 = vldrbq_z_s8(vector_1, p); + vector_sum_1 = vaddvaq_s8(vector_sum_1, ker_1); + const int8x16_t ker_2 = vldrbq_z_s8(vector_2, p); + vector_sum_2 = vaddvaq_s8(vector_sum_2, ker_2); + const int8x16_t ker_3 = vldrbq_z_s8(vector_3, p); + vector_sum_3 = vaddvaq_s8(vector_sum_3, ker_3); + const int8x16_t ker_4 = vldrbq_z_s8(vector_4, p); + vector_sum_4 = vaddvaq_s8(vector_sum_4, ker_4); + vector_0 += 16; + vector_1 += 16; + vector_2 += 16; + vector_3 += 16; + vector_4 += 16; + } + vector_data += 5 * vector_cols; + + vector_sum_0 *= lhs_offset; + vector_sum_1 *= lhs_offset; + vector_sum_2 *= lhs_offset; + vector_sum_3 *= lhs_offset; + vector_sum_4 *= lhs_offset; + + vector_sum_buf[0] += 
vector_sum_0; + vector_sum_buf[1] += vector_sum_1; + vector_sum_buf[2] += vector_sum_2; + vector_sum_buf[3] += vector_sum_3; + vector_sum_buf[4] += vector_sum_4; + vector_sum_buf += 5; + } + const int32_t loop_cnt = vector_rows % 5; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + const int8_t *vector_0 = vector_data; + int32_t vector_sum_0 = 0; + uint32_t col_cnt = (uint32_t)vector_cols; + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + vector_0 += 16; + } + vector_data += vector_cols; + vector_sum_0 *= lhs_offset; + + vector_sum_buf[i_row_loop_cnt] += vector_sum_0; + } +#else + for (int i = 0; i < vector_rows; i++) + { + int64_t sum = 0; + for (int j = 0; j < vector_cols; j++) + { + sum += *vector_data++; + } + *vector_sum_buf++ += sum * (int64_t)lhs_offset; + } +#endif + } + + return (ARM_CMSIS_NN_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Source/LSTMFunctions/CMakeLists.txt b/Source/LSTMFunctions/CMakeLists.txt index eed27265..e201e3a0 100644 --- a/Source/LSTMFunctions/CMakeLists.txt +++ b/Source/LSTMFunctions/CMakeLists.txt @@ -17,4 +17,5 @@ # file(GLOB SRC_S8 "./*_s8.c") -target_sources(cmsis-nn PRIVATE ${SRC_S8}) +file(GLOB SRC_S16 "./*_s16.c") +target_sources(cmsis-nn PRIVATE ${SRC_S8} ${SRC_S16}) diff --git a/Source/LSTMFunctions/arm_lstm_unidirectional_s16.c b/Source/LSTMFunctions/arm_lstm_unidirectional_s16.c new file mode 100644 index 00000000..4d4ed021 --- /dev/null +++ b/Source/LSTMFunctions/arm_lstm_unidirectional_s16.c @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright 2024, Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_lstm_unidirectional_s16.c + * Description: S16 LSTM function with S16 gate output + * + * $Date: 26 March 2024 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" +/** + * @ingroup Public + */ + +/** + * @addtogroup LSTM + * @{ + */ + +/* + * S16 LSTM function for TensorFlow Lite with S16 gate output + * + * Refer to header file for details. + * + */ + +arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input, + int16_t *output, + const cmsis_nn_lstm_params *params, + cmsis_nn_lstm_context *buffers) +{ + + int16_t *hidden_in = NULL; + memset(buffers->cell_state, 0, params->batch_size * params->hidden_size * sizeof(int16_t)); + if (params->time_major) + { + // First dimension is time, input/output for each time step is stored continously in memory + for (int t = 0; t < params->time_steps; t++) + { + const int16_t *data_in = input + (t * params->batch_size * params->input_size); + int16_t *hidden_out = output + (t * params->batch_size * params->hidden_size); + arm_cmsis_nn_status status = arm_nn_lstm_step_s16(data_in, hidden_in, hidden_out, params, buffers, 1); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + // Output is used as recurrent input/hidden state for the next timestep. 
+ hidden_in = &hidden_out[0]; + } + } + else + { + // First dimension is time, add batch_offset to jump in memory for each batch + for (int t = 0; t < params->time_steps; t++) + { + const int16_t *data_in = input + (t * params->input_size); + int16_t *hidden_out = output + (t * params->hidden_size); + arm_cmsis_nn_status status = + arm_nn_lstm_step_s16(data_in, hidden_in, hidden_out, params, buffers, params->time_steps); + if (status != ARM_CMSIS_NN_SUCCESS) + { + return status; + } + // Output is used as recurrent input/hidden state for the next timestep. + hidden_in = &hidden_out[0]; + } + } + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of LSTM group + */ diff --git a/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s16.c b/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s16.c new file mode 100644 index 00000000..1c49fbfe --- /dev/null +++ b/Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s16.c @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright 2022, 2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_calculate_gate_s16.c + * Description: Update single gate for an incremental step of LSTM function. 
+ * + * $Date: 26 March 2024 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @defgroup supportLSTM + * + * Support functions for LSTM + * + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculates a single LSTM gate, int16x8_16 version. + * Refer to header file for details + */ +arm_cmsis_nn_status arm_nn_lstm_calculate_gate_s16(const int16_t *data_in, + const int16_t *hidden_in, + const cmsis_nn_lstm_gate *gate, + const cmsis_nn_lstm_params *params, + int16_t *output, + const int32_t batch_offset) +{ + + memset(output, 0, params->hidden_size * params->batch_size * sizeof(int16_t)); + + arm_nn_vec_mat_mul_result_acc_s16(data_in, + gate->input_weights, + gate->input_effective_bias, + output, + gate->input_multiplier, + gate->input_shift, + params->input_size, + params->hidden_size, + params->batch_size, + batch_offset); + + if (hidden_in) + { + + arm_nn_vec_mat_mul_result_acc_s16(hidden_in, + gate->hidden_weights, + gate->hidden_effective_bias, + output, + gate->hidden_multiplier, + gate->hidden_shift, + params->hidden_size, + params->hidden_size, + params->batch_size, + batch_offset); + } + + arm_nn_activation_s16(output, output, params->hidden_size * params->batch_size, 0, gate->activation_type); + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of supportLSTM group + */ \ No newline at end of file diff --git a/Source/NNSupportFunctions/arm_nn_lstm_step_s16.c b/Source/NNSupportFunctions/arm_nn_lstm_step_s16.c new file mode 100644 index 00000000..37cf363f --- /dev/null +++ b/Source/NNSupportFunctions/arm_nn_lstm_step_s16.c @@ -0,0 +1,112 @@ +/* + * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 
2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_lstm_step_s16.c + * Description: Update LSTM function for a single iteration step. + * + * $Date: 26 March 2024 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportLSTM + * @{ + */ + +/* + * Calculate the output state tensor of an LSTM step, s16 input/output/weights and s16 internal buffers version. + * Refer to header file for details. 
+ */ +arm_cmsis_nn_status arm_nn_lstm_step_s16(const int16_t *data_in, + const int16_t *hidden_in, + int16_t *hidden_out, + const cmsis_nn_lstm_params *params, + cmsis_nn_lstm_context *buffers, + const int32_t batch_offset) +{ + int16_t *forget_gate = buffers->temp1; + int16_t *input_gate = buffers->temp1; + int16_t *cell_gate = buffers->temp2; + int16_t *output_gate = buffers->temp1; + int16_t *hidden_temp = buffers->temp2; + + int16_t *cell_state = buffers->cell_state; + + arm_nn_lstm_calculate_gate_s16(data_in, hidden_in, ¶ms->forget_gate, params, forget_gate, batch_offset); + + // Calculate first term of cell state in place early to maximise reuse of scratch-buffers + arm_elementwise_mul_s16(forget_gate, + cell_state, + 0, + 0, + cell_state, + 0, + params->forget_to_cell_multiplier, + params->forget_to_cell_shift, + NN_Q15_MIN, + NN_Q15_MAX, + params->hidden_size * params->batch_size); + + arm_nn_lstm_calculate_gate_s16(data_in, hidden_in, ¶ms->input_gate, params, input_gate, batch_offset); + + arm_nn_lstm_calculate_gate_s16(data_in, hidden_in, ¶ms->cell_gate, params, cell_gate, batch_offset); + + // Reminder of cell state calculation, multiply and add to previous result. + arm_elementwise_mul_acc_s16(forget_gate, + cell_gate, + 0, + 0, + cell_state, + 0, + params->input_to_cell_multiplier, + params->input_to_cell_shift, + -params->cell_clip, + params->cell_clip, + params->hidden_size * params->batch_size); + + arm_nn_lstm_calculate_gate_s16(data_in, hidden_in, ¶ms->output_gate, params, output_gate, batch_offset); + + // Calculate hidden state directly to output. 
+ arm_nn_activation_s16( + cell_state, hidden_temp, params->hidden_size * params->batch_size, params->cell_scale_power + 12, ARM_TANH); + arm_elementwise_mul_s16_batch_offset(output_gate, + hidden_temp, + hidden_out, + params->output_offset, + params->output_multiplier, + params->output_shift, + params->hidden_size, + params->batch_size, + batch_offset); + + return ARM_CMSIS_NN_SUCCESS; +} +/** + * @} end of supportLSTM group + */ diff --git a/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c b/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c new file mode 100644 index 00000000..6281e64f --- /dev/null +++ b/Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s16.c @@ -0,0 +1,362 @@ +/* + * SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mul_result_acc_s16 + * Description: s16 vector by matrix (transposed) multiplication + * + * $Date: 26 March 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportFC + * @{ + */ + +/* + * s16 vector(lhs) by matrix (transposed) multiplication with result accumulation + * + * Refer header file for details. + * + */ +arm_cmsis_nn_status arm_nn_vec_mat_mul_result_acc_s16(const int16_t *lhs, + const int8_t *rhs, + const int64_t *effective_bias, + int16_t *dst, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t batches, + const int32_t batch_offset) +{ + + int32_t reduced_multiplier = REDUCE_MULTIPLIER(dst_multiplier); + + for (int batch = 0; batch < batches; batch++) + { + + const int8_t *rhs_ptr = &rhs[0]; + const int64_t *effective_bias_ptr = &effective_bias[0]; + +#if defined(ARM_MATH_DSP) + + int32_t rhs_cols_fast = rhs_cols; + + if (rhs_cols > MAX_COL_COUNT) + { + rhs_cols_fast = MAX_COL_COUNT; + } + + #if defined(ARM_MATH_MVEI) + int32_t row_loop_cnt = rhs_rows / 4; + const int32_t col_loop_cnt = (rhs_cols_fast + 7) / 8; + + for (int32_t i_row_loop_count = 0; i_row_loop_count < row_loop_cnt; i_row_loop_count++) + { + int32_t col_cnt = rhs_cols_fast; + + const int16_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = rhs_ptr; + const int8_t *rhs_ptr_1 = rhs_ptr + rhs_cols; + const int8_t *rhs_ptr_2 = rhs_ptr + rhs_cols * 2; + const int8_t *rhs_ptr_3 = rhs_ptr + rhs_cols * 3; + + int32_t result_0 = *effective_bias_ptr++; + int32_t result_1 = *effective_bias_ptr++; + int32_t result_2 = *effective_bias_ptr++; + int32_t result_3 = *effective_bias_ptr++; + + for 
(int i_col_loop_cnt = 0; i_col_loop_cnt < col_loop_cnt; i_col_loop_cnt++) + { + mve_pred16_t pred = vctp16q(col_cnt); + col_cnt -= 8; + + int16x8_t lhs_input = vldrhq_z_s16(lhs_ptr, pred); + + int16x8_t rhs_input_0 = vldrbq_z_s16(rhs_ptr_0, pred); + int16x8_t rhs_input_1 = vldrbq_z_s16(rhs_ptr_1, pred); + int16x8_t rhs_input_2 = vldrbq_z_s16(rhs_ptr_2, pred); + int16x8_t rhs_input_3 = vldrbq_z_s16(rhs_ptr_3, pred); + + result_0 = vmladavaq_s16(result_0, lhs_input, rhs_input_0); + result_1 = vmladavaq_s16(result_1, lhs_input, rhs_input_1); + result_2 = vmladavaq_s16(result_2, lhs_input, rhs_input_2); + result_3 = vmladavaq_s16(result_3, lhs_input, rhs_input_3); + + lhs_ptr += 8; + + rhs_ptr_0 += 8; + rhs_ptr_1 += 8; + rhs_ptr_2 += 8; + rhs_ptr_3 += 8; + } + + int64_t result_64_0 = result_0; + int64_t result_64_1 = result_1; + int64_t result_64_2 = result_2; + int64_t result_64_3 = result_3; + + if (rhs_cols > MAX_COL_COUNT) + { + for (int i_rhs_cols = MAX_COL_COUNT; i_rhs_cols < rhs_cols; i_rhs_cols++) + { + const int16_t lhs_temp = *lhs_ptr++; + + result_64_0 += *rhs_ptr_0++ * lhs_temp; + result_64_1 += *rhs_ptr_1++ * lhs_temp; + result_64_2 += *rhs_ptr_2++ * lhs_temp; + result_64_3 += *rhs_ptr_3++ * lhs_temp; + } + } + + int32_t tmp; + tmp = arm_nn_requantize_s64(result_64_0, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + tmp = 0; + tmp = arm_nn_requantize_s64(result_64_1, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + tmp = 0; + tmp = arm_nn_requantize_s64(result_64_2, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + tmp = 0; + tmp = arm_nn_requantize_s64(result_64_3, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, 
NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + rhs_ptr += 4 * rhs_cols; + } + + for (int8_t rows_left = rhs_rows & 0x3; rows_left > 0; rows_left--) + { + int32_t result = *effective_bias_ptr++; + + const int16_t *lhs_ptr = lhs; + const int8_t *rhs_ptr0 = rhs_ptr; + + int32_t col_cnt = (int32_t)rhs_cols_fast; + + for (int i_col_loop_cnt = 0; i_col_loop_cnt < col_loop_cnt; i_col_loop_cnt++) + { + mve_pred16_t pred = vctp16q(col_cnt); + col_cnt -= 8; + + int16x8_t lhs_input = vldrhq_z_s16(lhs_ptr, pred); + int16x8_t rhs_input = vldrbq_z_s16(rhs_ptr0, pred); + + result = vmladavaq_p_s16(result, lhs_input, rhs_input, pred); + + lhs_ptr += 8; + rhs_ptr0 += 8; + } + + int64_t result_64 = result; + + if (rhs_cols > MAX_COL_COUNT) + { + for (int i_rhs_cols = MAX_COL_COUNT; i_rhs_cols < rhs_cols; i_rhs_cols++) + { + const int16_t lhs_temp = *lhs_ptr++; + + result_64 += *rhs_ptr0++ * lhs_temp; + } + } + + int32_t tmp = 0; + tmp = arm_nn_requantize_s64(result_64, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + rhs_ptr += rhs_cols; + } + + #else // ARM_MATH_MVEI + + const int32_t row_loop_cnt = rhs_rows / 2; + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + + int64_t acc_64_0 = 0; + int64_t acc_64_1 = 0; + int32_t acc_0 = 0; + int32_t acc_1 = 0; + + const int32_t col_loop_cnt = rhs_cols_fast / 4; + + const int16_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs_ptr; + rhs_ptr += rhs_cols; + const int8_t *rhs_1 = rhs_ptr; + rhs_ptr += rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t ker_0, ker_1, vec_part_0, vec_part_1; + + vec_part_0 = arm_nn_read_q15x2_ia(&lhs_vec); + vec_part_1 = arm_nn_read_q15x2_ia(&lhs_vec); + + rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); + + acc_0 = SMLAD(ker_0, vec_part_0, acc_0); + acc_0 = SMLAD(ker_1, vec_part_1, acc_0); + + rhs_1 = read_and_pad(rhs_1, &ker_0, &ker_1); + + acc_1 = SMLAD(ker_0, vec_part_0, acc_1); + acc_1 = SMLAD(ker_1, 
vec_part_1, acc_1); + } + + acc_64_0 += acc_0; + acc_64_1 += acc_1; + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec); + lhs_vec++; + acc_64_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_64_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + acc_64_0 += *effective_bias_ptr++; + acc_64_1 += *effective_bias_ptr++; + int32_t tmp; + + tmp = arm_nn_requantize_s64(acc_64_0, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + + tmp = arm_nn_requantize_s64(acc_64_1, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + } + + if (rhs_rows & 0x1) + { + int64_t acc_64_0 = 0; + int32_t acc_0 = 0; + const int32_t col_loop_cnt = rhs_cols_fast / 4; + + const int16_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs_ptr; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t ker_0, ker_1, vec; + rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); + + vec = arm_nn_read_q15x2_ia(&lhs_vec); + acc_0 = SMLAD(ker_0, vec, acc_0); + + vec = arm_nn_read_q15x2_ia(&lhs_vec); + acc_0 = SMLAD(ker_1, vec, acc_0); + } + + acc_64_0 += acc_0; + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec); + lhs_vec++; + acc_64_0 += lhs_temp * (*rhs_0); + rhs_0++; + } + + acc_64_0 += *effective_bias_ptr++; + + int32_t tmp; + tmp = arm_nn_requantize_s64(acc_64_0, reduced_multiplier, dst_shift); + tmp += (int64_t)*dst; + tmp = MAX(tmp, NN_Q15_MIN); + tmp = MIN(tmp, NN_Q15_MAX); + *dst++ = (int16_t)tmp; + } + + #endif // ARM_MATH_MVEI +#else // ARM_MATH_DSP + for (int i_row_loop_cnt = 0; i_row_loop_cnt < rhs_rows; i_row_loop_cnt++) + { + const int16_t *lhs_ptr = lhs; + + int64_t result = *effective_bias_ptr++; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const int64_t rhs_value0 = (int8_t)*rhs_ptr; + const int64_t lhs_value = 
*lhs_ptr; + + result += lhs_value * rhs_value0; + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + result = arm_nn_requantize_s64(result, reduced_multiplier, dst_shift); + result += (int64_t)*dst; + + // Clamp the result + result = ((result) > (NN_Q15_MIN) ? (result) : (NN_Q15_MIN)); + result = ((result) < (NN_Q15_MAX) ? (result) : (NN_Q15_MAX)); + + *dst++ = (int16_t)result; + } +#endif // ARM_MATH_DSP + + lhs += rhs_cols * batch_offset; + } + + return ARM_CMSIS_NN_SUCCESS; +} + +/** + * @} end of Doxygen group + */ diff --git a/Tests/UnitTest/CMakeLists.txt b/Tests/UnitTest/CMakeLists.txt index cb273db5..dcc06577 100644 --- a/Tests/UnitTest/CMakeLists.txt +++ b/Tests/UnitTest/CMakeLists.txt @@ -105,6 +105,7 @@ add_subdirectory(TestCases/test_arm_softmax_s8_s16) add_subdirectory(TestCases/test_arm_svdf_s8) add_subdirectory(TestCases/test_arm_svdf_state_s16_s8) add_subdirectory(TestCases/test_arm_transpose_conv_s8) +add_subdirectory(TestCases/test_arm_lstm_unidirectional_s16) set(MAKE_CMD "python3") set(MAKE_CMD_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/unittest_targets.py") diff --git a/Tests/UnitTest/RefactoredTestGen/JsonTemplates/test_arm_lstm_s16.json b/Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16.json similarity index 99% rename from Tests/UnitTest/RefactoredTestGen/JsonTemplates/test_arm_lstm_s16.json rename to Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16.json index d45163b9..0b947e20 100644 --- a/Tests/UnitTest/RefactoredTestGen/JsonTemplates/test_arm_lstm_s16.json +++ b/Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16.json @@ -448,7 +448,7 @@ "fused_activation_function": "TANH", "cell_clip": cell_clip, "proj_clip": 0.0, - "time_major": false, + "time_major": time_major, "asymmetric_quantize_inputs": false, "diagonal_recurrent_tensors": false }, diff --git a/Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16_tm.json b/Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16_tm.json new file mode 100644 index 00000000..6e0e61d5 
--- /dev/null +++ b/Tests/UnitTest/RefactoredTestGen/JsonTemplates/lstm_s16_tm.json @@ -0,0 +1,620 @@ +{ + "version": 3, + "operator_codes": [ + { + "deprecated_builtin_code": 44, + "version": 1, + "builtin_code": "UNIDIRECTIONAL_SEQUENCE_LSTM" + } + ], + "subgraphs": [ + { + "tensors": [ + { + "shape": [ + time_steps, + batch_size, + input_size + ], + "type": "INT16", + "buffer": 0, + "name": "serving_default_input:0", + "quantization": { + "scale": [ + input_scale + ], + "zero_point": [ + input_zero_point + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size + ], + "type": "INT64", + "buffer": 2, + "name": "arith.constant4", + "quantization": { + "scale": [ + 0.000029 + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size + ], + "type": "INT64", + "buffer": 3, + "name": "arith.constant5", + "quantization": { + "scale": [ + 0.000024 + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size + ], + "type": "INT64", + "buffer": 4, + "name": "arith.constant6", + "quantization": { + "scale": [ + 0.000028 + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size + ], + "type": "INT64", + "buffer": 5, + "name": "arith.constant7", + "quantization": { + "scale": [ + 0.00002 + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + hidden_size + ], + "type": "INT8", + "buffer": 6, + "name": "arith.constant", + "quantization": { + "scale": [ + output_gate_hidden_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + 
"quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + hidden_size + ], + "type": "INT8", + "buffer": 7, + "name": "arith.constant1", + "quantization": { + "scale": [ + cell_gate_hidden_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + hidden_size + ], + "type": "INT8", + "buffer": 8, + "name": "arith.constant2", + "quantization": { + "scale": [ + forget_gate_hidden_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + hidden_size + ], + "type": "INT8", + "buffer": 9, + "name": "arith.constant3", + "quantization": { + "scale": [ + input_gate_hidden_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + input_size + ], + "type": "INT8", + "buffer": 10, + "name": "arith.constant8", + "quantization": { + "scale": [ + output_gate_input_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + input_size + ], + "type": "INT8", + "buffer": 11, + "name": "arith.constant9", + "quantization": { + "scale": [ + cell_gate_input_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + input_size + ], + "type": "INT8", + "buffer": 12, + "name": "arith.constant10", + "quantization": { + "scale": [ + forget_gate_input_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + hidden_size, + input_size + ], + "type": 
"INT8", + "buffer": 13, + "name": "arith.constant11", + "quantization": { + "scale": [ + input_gate_input_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + batch_size, + hidden_size + ], + "type": "INT16", + "buffer": 0, + "name": "tfl.pseudo_qconst", + "quantization": { + "scale": [ + output_scale + ], + "zero_point": [ + output_zero_point + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": true, + "has_rank": true + }, + { + "shape": [ + batch_size, + hidden_size + ], + "type": "INT16", + "buffer": 0, + "name": "tfl.pseudo_qconst1", + "quantization": { + "scale": [ + cell_scale + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": true, + "has_rank": true + }, + { + "shape": [ + 0 + ], + "type": "FLOAT32", + "buffer": 0, + "name": "input_to_input_intermediate", + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + 0 + ], + "type": "FLOAT32", + "buffer": 0, + "name": "input_to_forget_intermediate", + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + 0 + ], + "type": "FLOAT32", + "buffer": 0, + "name": "input_to_cell_intermediate", + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + 0 + ], + "type": "FLOAT32", + "buffer": 0, + "name": "input_to_output_intermediate", + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + 0 + ], + "type": "INT16", + "buffer": 0, + "name": "effective_hidden_scale_intermediate", + "quantization": { + "scale": [ + 0 + ], + "zero_point": [ + 0 + ], + "details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + }, + { + "shape": [ + time_steps, + batch_size, + hidden_size + ], + "type": "INT16", + "buffer": 16, + "name": "StatefulPartitionedCall:0", + "quantization": { + "scale": [ + output_scale + ], + "zero_point": [ + output_zero_point + ], + 
"details_type": "NONE", + "quantized_dimension": 0 + }, + "is_variable": false, + "has_rank": true + } + ], + "inputs": [ + 0 + ], + "outputs": [ + 20 + ], + "operators": [ + { + "opcode_index": 0, + "inputs": [ + 0, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + -1, + -1, + -1, + 4, + 3, + 2, + 1, + -1, + -1, + 13, + 14, + -1, + -1, + -1, + -1 + ], + "outputs": [ + 20 + ], + "builtin_options_type": "UnidirectionalSequenceLSTMOptions", + "builtin_options": { + "fused_activation_function": "TANH", + "cell_clip": cell_clip, + "proj_clip": 0.0, + "time_major": time_major, + "asymmetric_quantize_inputs": false, + "diagonal_recurrent_tensors": false + }, + "custom_options_format": "FLEXBUFFERS", + "intermediates": [ + 15, + 16, + 17, + 18, + 19 + ], + "large_custom_options_offset": 0, + "large_custom_options_size": 0, + "builtin_options_2_type": "NONE" + } + ], + "name": "main" + } + ], + "description": "MLIR Converted.", + "buffers": [ + { + "offset": 0, + "size": 10 + }, + { + "offset": 0, + "size": 0 + }, + { + "data": [ + output_gate_bias + ], + "offset": 0, + "size": 0 + }, + { +"data": [ + cell_gate_bias + ], + "offset": 0, + "size": 0 + }, + { +"data": [ + forget_gate_bias + ], + "offset": 0, + "size": 0 + }, + { +"data": [ + input_gate_bias + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + output_gate_hidden_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + cell_gate_hidden_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + forget_gate_hidden_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + input_gate_hidden_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + output_gate_input_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + cell_gate_input_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + forget_gate_input_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + input_gate_input_weights + ], + "offset": 0, + "size": 0 + }, + { + "data": [ + 0, + 0, + 0, + 0 + ], + "offset": 0, 
+ "size": 0 + }, + { + "offset": 0, + "size": 0 + }, + { + "offset": 0, + "size": 0 + }, + { + "offset": 0, + "size": 0 + }, + { + "offset": 0, + "size": 0 + } + ], + "metadata": [ + { + "name": "min_runtime_version", + "buffer": 17 + }, + { + "name": "CONVERSION_METADATA", + "buffer": 18 + } + ], + "signature_defs": [ + { + "inputs": [ + { + "name": "input", + "tensor_index": 0 + } + ], + "outputs": [ + { + "name": "lstm", + "tensor_index": 20 + } + ], + "signature_key": "serving_default", + "subgraph_index": 0 + } + ] +} diff --git a/Tests/UnitTest/RefactoredTestGen/Lib/op_lstm.py b/Tests/UnitTest/RefactoredTestGen/Lib/op_lstm.py index d1d8aef9..02f2ec88 100644 --- a/Tests/UnitTest/RefactoredTestGen/Lib/op_lstm.py +++ b/Tests/UnitTest/RefactoredTestGen/Lib/op_lstm.py @@ -27,7 +27,11 @@ class Op_lstm(Lib.op_utils.Op_type): def get_shapes(params): shapes = {} - shapes["input"] = (params["batch_size"], params["time_steps"], params["input_size"]) + if params["time_major"] and params["tflite_generator"] == "json": + shapes["input"] = (params["time_steps"], params["batch_size"], params["input_size"]) + else: + shapes["input"] = (params["batch_size"], params["time_steps"], params["input_size"]) + shapes["input_weights"] = (params["input_size"], params["hidden_size"]) shapes["all_input_weights"] = (params["input_size"], params["hidden_size"] * 4) @@ -135,8 +139,8 @@ def generate_data_json(shapes, params): effective_scales = {} generated_params = {} - maxval = 0.009 - minval = 0.002 + maxval = 0.001 + minval = 0.0001 scales["input_scale"] = np.round(np.random.rand(1) * (maxval - minval) + minval, 6)[0] scales["cell_scale"] = np.round(np.random.rand(1) * (maxval - minval) + maxval, 6)[0] @@ -175,8 +179,8 @@ def create_scales(name, input_scale1): tensors["cell_gate_input_weights"] = np.random.randint(minval, maxval, size=shapes["input_weights"]) tensors["output_gate_input_weights"] = np.random.randint(minval, maxval, size=shapes["input_weights"]) - maxval = 1 - minval = 0 + 
maxval = Lib.op_utils.get_dtype_max(params["input_data_type"]) + minval = 0 # Negative weights are not supported in test generation tensors["input_gate_bias"] = np.random.randint(minval, maxval, size=shapes["bias"]) tensors["forget_gate_bias"] = np.random.randint(minval, maxval, size=shapes["bias"]) tensors["cell_gate_bias"] = np.random.randint(minval, maxval, size=shapes["bias"]) diff --git a/Tests/UnitTest/RefactoredTestGen/Lib/op_utils.py b/Tests/UnitTest/RefactoredTestGen/Lib/op_utils.py index dc45e87c..ad74b0e8 100644 --- a/Tests/UnitTest/RefactoredTestGen/Lib/op_utils.py +++ b/Tests/UnitTest/RefactoredTestGen/Lib/op_utils.py @@ -88,7 +88,7 @@ def get_np_dtype(dtype): if dtype == "int32_t": return np.uint32 if dtype == "int64_t": - return np.uint32 + return np.uint64 else: raise Exception(f"Unrecognized dtype '{dtype}'") diff --git a/Tests/UnitTest/RefactoredTestGen/Lib/test.py b/Tests/UnitTest/RefactoredTestGen/Lib/test.py index 466f8c36..ea8d1826 100644 --- a/Tests/UnitTest/RefactoredTestGen/Lib/test.py +++ b/Tests/UnitTest/RefactoredTestGen/Lib/test.py @@ -117,7 +117,7 @@ def generate(params, args, fpaths): include_in_config = lambda key: key not in [ "suite_name", "name", "input_data_type", "op_type", "input_data_type", "weights_data_type", "bias_data_type", - "interpreter", "tflite_generator" + "interpreter", "tflite_generator", "json_template" ] config_params = {key: val for key, val in params.items() if include_in_config(key)} write_config(fpaths["config_data"], config_params, params["name"], fpaths["test_data"], header) @@ -223,20 +223,32 @@ def write_config(config_fpath, params, prefix, test_data_fpath, header): def write_c_array(data, fname, dtype, prefix, tensor_name, test_data_fpath, header): + + # Check that the data looks reasonable + values, counts = np.unique(data, return_counts=True) + if len(values) < data.size / 2 or max(counts) > data.size / 2: + print(f"WARNING: {fname} has repeating values, is this intended?") + if len(data) > 500: + 
print(f"WARNING: {fname} has more than 500 values, is this intended?") + with fname.open("w+") as f: f.write(header) f.write("#pragma once\n") f.write("#include \n\n") - if not data is None: + data_shape = data.shape + format_width = len(str(data.max())) + 1 data = data.flatten() f.write(f"const {dtype} {prefix}_{tensor_name}[{len(data)}] = \n" + "{") for i in range(len(data) - 1): - f.write(f"{data[i]: 5n}, ") - if i % 16 == 0: + if i % data_shape[-1] == 0: f.write("\n") - f.write(str(data[len(data) - 1]) + "\n};") + f.write(f"{data[i]: {format_width}n}, ") + + if len(data)-1 % data_shape[-1] == 0: + f.write("\n") + f.write(f"{data[len(data) - 1]: {format_width}n}" + "\n};") else: f.write(f"const {dtype} *{prefix}_{tensor_name} = NULL;\n") @@ -268,11 +280,16 @@ def convert_json_to_tflite(json_template_fpath, json_output_fpath, tensors, para for line in template: line_list = line.replace(",", "").split() replaced = False - for key in params: + for key, val in params.items(): if key in line_list: + if isinstance(val, bool): + if val: + val = "true" + else: + val = "false" # To be able to handle cases like "variable_name" : variable_name # make sure to only replace the last occurence per line - new_line = str(params[key]).join(line.rsplit(key, 1)) + new_line = str(val).join(line.rsplit(key, 1)) output.write(new_line) replaced = True break @@ -318,7 +335,7 @@ def quantize_scale(scale): def get_header(generator, interpreter): if generator == "keras": - header = f"// Generated by test.py using tensorflow version {tf.__version__} (Keras version {keras.__version__}).\n" + header = f"// Generated by {os.path.basename(__file__)} using tensorflow version {tf.__version__} (Keras version {keras.__version__}).\n" elif generator == "json": command = f"flatc --version" command_list = command.split() @@ -333,7 +350,7 @@ def get_header(generator, interpreter): sys.exit(1) except Exception as e: raise RuntimeError(f"{e} from: {command = }. 
Did you install flatc?") - header = f"// Generated by test.py using {flatc_version}\n" + header = f"// Generated by {os.path.basename(__file__)} using {str(flatc_version)[2:-3]}\n" else: raise Exception diff --git a/Tests/UnitTest/RefactoredTestGen/Lib/test_plan.py b/Tests/UnitTest/RefactoredTestGen/Lib/test_plan.py index c9c60dae..fa0540be 100644 --- a/Tests/UnitTest/RefactoredTestGen/Lib/test_plan.py +++ b/Tests/UnitTest/RefactoredTestGen/Lib/test_plan.py @@ -16,18 +16,27 @@ # import json import Lib.test_suite - +import sys def generate(args): """Generate a number of test suites defined by a json-file test plan""" - print(f"\nGenerating tests from {args.test_plan}") test_plan = args.test_plan.read_text() test_suite_params_list = json.loads(test_plan) - test_suites = [] + # List available tests for convenience + if args.list: + for suite in test_suite_params_list: + print(f"{suite['suite_name']}") + for test in suite["tests"]: + print(f"- {test['name']}") + + sys.exit() + + print(f"\nGenerating tests from {args.test_plan}") for test_suite_params in test_suite_params_list: if (test_suite_params["suite_name"] in args.test_suites) or (args.test_suites == []): - print(f"{test_suite_params['suite_name']}") - test_suite = Lib.test_suite.generate(test_suite_params, args) - test_suites.append(test_suite) + test_names = [test["name"] for test in test_suite_params["tests"] if test["name"] in args.tests] + if (len(test_names) > 0) or (args.tests == []): + print(f"{test_suite_params['suite_name']}") + test_suite = Lib.test_suite.generate(test_suite_params, args) diff --git a/Tests/UnitTest/RefactoredTestGen/generate_test_data.py b/Tests/UnitTest/RefactoredTestGen/generate_test_data.py index a0591c51..6af83ee4 100755 --- a/Tests/UnitTest/RefactoredTestGen/generate_test_data.py +++ b/Tests/UnitTest/RefactoredTestGen/generate_test_data.py @@ -17,8 +17,9 @@ # limitations under the License. 
# import os -os.environ["TF_USE_LEGACY_KERAS"]="1" # See https://github.com/tensorflow/tensorflow/releases/tag/v2.16.1 -os.environ['TF_CPP_MIN_LOG_LEVEL'] ="2" # See https://github.com/tensorflow/tensorflow/issues/59779 + +os.environ["TF_USE_LEGACY_KERAS"] = "1" # See https://github.com/tensorflow/tensorflow/releases/tag/v2.16.1 +os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" # See https://github.com/tensorflow/tensorflow/issues/59779 import json import argparse import pathlib @@ -55,6 +56,8 @@ def main(): default="../../../tflite_micro/tensorflow/lite/schema/schema.fbs", help="Path to the schema-file needed for generating tflite-files with flatc") parser.add_argument("--verbose", action="store_true", help="Enable additional logging") + parser.add_argument("--list", action="store_true", help="Only list tests in test plan") + args = parser.parse_args() Lib.test_plan.generate(args) diff --git a/Tests/UnitTest/RefactoredTestGen/test_plan.json b/Tests/UnitTest/RefactoredTestGen/test_plan.json index 3fe2f5fc..b577d23c 100644 --- a/Tests/UnitTest/RefactoredTestGen/test_plan.json +++ b/Tests/UnitTest/RefactoredTestGen/test_plan.json @@ -14,7 +14,7 @@ "time_steps" : 10, "input_size" : 22, "hidden_size" : 11, - "json_template": "test_arm_lstm_s16.json" + "json_template": "lstm_s16_tm.json" }, {"name" : "lstm_2_s16", "time_major" : false, @@ -22,7 +22,7 @@ "time_steps" : 9, "input_size" : 6, "hidden_size" : 7, - "json_template": "test_arm_lstm_s16.json" + "json_template": "lstm_s16.json" }, {"name" : "lstm_one_time_step_s16", "time_major" : false, @@ -30,7 +30,7 @@ "time_steps" : 1, "input_size" : 22, "hidden_size" : 3, - "json_template": "test_arm_lstm_s16.json" + "json_template": "lstm_s16.json" } ] }, diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_bias.h new file mode 100644 index 00000000..8306e32b --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_bias.h @@ -0,0 +1,7 
@@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_1_s16_cell_gate_bias[11] = + {20142, 23563, 15741, 25098, 29041, 25327, 12730, 19511, 21749, 13563, 31032}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_hidden_weights.h new file mode 100644 index 00000000..ba8777b4 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_hidden_weights.h @@ -0,0 +1,12 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_1_s16_cell_gate_hidden_weights[121] = { + 99, -84, 84, -14, -61, 94, -60, 67, -85, 56, -108, 69, 53, -34, 103, 75, 97, -101, -84, -100, 89, + -105, 124, -83, 4, 42, -9, -18, 17, -32, -37, -117, 94, -105, 125, -92, 73, 122, -50, 31, -66, 123, + 2, 36, 118, -19, -98, 108, -67, -56, 77, 15, -61, -5, -25, -19, 106, -121, -102, -81, -54, 35, 80, + 39, -7, -115, 107, 38, 30, -28, 85, -12, 111, -124, 54, 0, -9, 39, -35, 102, -45, 22, 83, -39, + -94, -96, -60, 30, -101, 48, 108, -56, -78, -110, 13, 70, 49, -44, -75, -52, 78, 102, -57, -26, -97, + -95, 18, -77, -128, 50, -76, 37, -68, -63, 76, 107, 56, 115, -108, -74, 34}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_input_weights.h new file mode 100644 index 00000000..83c21cc1 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/cell_gate_input_weights.h @@ -0,0 +1,19 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_1_s16_cell_gate_input_weights[242] = { + -66, 25, 6, 53, -50, 29, -127, -102, -15, 112, -13, 60, -87, -92, -108, 28, 81, -17, -16, + 54, -40, -109, -105, -60, -26, -71, -116, -76, -104, 48, 108, 118, 74, -124, 81, -111, -16, -117, + -91, 25, 38, 121, -19, -1, 109, 39, 77, -76, 99, 34, 2, 45, 26, 9, -111, -62, 73, + -127, 2, -8, -115, 110, -47, 114, -51, -23, 111, -84, -112, 27, -101, 13, 67, 95, 118, -46, + 37, -51, 116, -98, 87, 0, 109, 119, -102, -122, 79, 110, -47, -114, -121, -77, -45, -94, -1, + 5, -74, -40, -74, 41, -67, 16, -115, -20, 0, 27, 78, 29, -116, -114, -119, -43, -50, -31, + -121, -52, -42, -91, -24, -82, -24, -23, -106, 40, 62, -36, -102, 7, -79, -12, 56, 16, 79, + -29, -39, -74, 118, 6, -46, -115, 100, -38, 33, 70, 0, -41, 111, -28, 42, -9, -113, -22, + -15, -96, 81, -37, 88, -77, 47, 111, -39, 85, 79, 97, 62, -74, 91, -112, -62, 78, -5, + 52, -64, -97, 38, -98, 84, -88, -59, -2, -107, -91, 12, -56, 51, 32, 0, 78, -14, -14, + -7, 124, -85, -122, 52, 42, -9, 10, 19, 20, -8, 79, 104, 14, -56, 1, 124, 81, 69, + -20, -62, -105, -67, 69, -25, 88, -15, -108, 47, -111, -26, 87, 25, -98, 19, -4, 68, 4, + 47, -115, -18, -40, 87, 50, -79, -104, -122, -77, -64, 125, -93, -75}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/config_data.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/config_data.h new file mode 100644 index 00000000..a0b0925c --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/config_data.h @@ -0,0 +1,34 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#define lstm_1_s16_time_major true +#define lstm_1_s16_batch_size 1 +#define lstm_1_s16_time_steps 10 +#define lstm_1_s16_input_size 22 +#define lstm_1_s16_hidden_size 11 +#define lstm_1_s16_cell_scale_power -9 +#define lstm_1_s16_output_zero_point 0 +#define lstm_1_s16_input_zero_point 0 +#define lstm_1_s16_cell_clip 32767 +#define lstm_1_s16_forget_to_cell_multiplier 1073741824 +#define lstm_1_s16_forget_to_cell_shift -14 +#define lstm_1_s16_input_to_cell_multiplier 1107260824 +#define lstm_1_s16_input_to_cell_shift -20 +#define lstm_1_s16_output_multiplier 1213629630 +#define lstm_1_s16_output_shift -16 +#define lstm_1_s16_output_gate_hidden_multiplier 1898861718 +#define lstm_1_s16_output_gate_hidden_shift -11 +#define lstm_1_s16_cell_gate_hidden_multiplier 1891079498 +#define lstm_1_s16_cell_gate_hidden_shift -11 +#define lstm_1_s16_forget_gate_hidden_multiplier 1103129707 +#define lstm_1_s16_forget_gate_hidden_shift -11 +#define lstm_1_s16_input_gate_hidden_multiplier 1470839610 +#define lstm_1_s16_input_gate_hidden_shift -12 +#define lstm_1_s16_output_gate_input_multiplier 1106912731 +#define lstm_1_s16_output_gate_input_shift -8 +#define lstm_1_s16_cell_gate_input_multiplier 1407915316 +#define lstm_1_s16_cell_gate_input_shift -11 +#define lstm_1_s16_forget_gate_input_multiplier 1606965412 +#define lstm_1_s16_forget_gate_input_shift -9 +#define lstm_1_s16_input_gate_input_multiplier 1177308497 +#define lstm_1_s16_input_gate_input_shift -8 diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_bias.h new file mode 100644 index 00000000..bfa39ece --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int64_t lstm_1_s16_forget_gate_bias[11] = {8691, 24230, 5069, 6375, 13615, 17184, 3326, 27937, 5850, 1018, 15410}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_hidden_weights.h new file mode 100644 index 00000000..cc2f7476 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_hidden_weights.h @@ -0,0 +1,12 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_1_s16_forget_gate_hidden_weights[121] = { + 59, 122, -110, 124, -13, 123, 42, 108, 71, -65, 116, -23, 62, -80, 96, 111, -27, 42, -34, -13, -39, + -119, 107, -44, -71, 91, 21, -28, 20, 96, -76, 23, 74, 23, -118, 95, -72, -47, 87, 111, -41, 68, + 7, 113, 66, 9, -36, 24, 126, -4, 2, -36, 52, 50, 108, 35, -22, 18, -114, -65, -16, -112, -52, + 35, 113, -77, 82, -120, 92, 61, -91, 104, -28, 74, -37, 55, -64, 106, -38, 48, -105, -127, -8, -110, + -109, -42, -71, 121, -24, -3, 30, -40, -29, -111, -63, 66, 119, 25, -75, -52, 93, -50, 15, -107, -11, + -108, -43, 23, 50, 45, 7, -90, 13, -125, 98, -118, -9, 125, -80, -66, -14}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_input_weights.h new file mode 100644 index 00000000..2537dcfa --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/forget_gate_input_weights.h @@ -0,0 +1,19 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_1_s16_forget_gate_input_weights[242] = { + -56, -78, 96, -62, -38, -48, -92, -47, 4, 121, -38, -82, -39, 55, -53, -15, 53, -19, 89, + 77, -28, 73, 22, 112, 18, 71, -27, -110, -16, 16, -119, -33, 1, -52, -16, 92, -21, -7, + 108, -75, -16, 44, -55, -70, -106, -37, 40, 60, 86, -120, 80, -103, 25, -121, 27, -23, 57, + -39, -101, 111, 2, 25, 77, 4, -69, 13, 104, -64, -4, -3, 79, -114, -68, -57, -74, 103, + -120, -87, 112, -59, 44, 27, 49, -87, -37, 121, -83, -74, -109, 75, 28, 102, 110, -11, -113, + -30, -65, -86, 59, -125, -95, -76, 121, 31, 90, 123, -76, -15, -100, 39, 21, 117, 40, -106, + -72, 5, 83, -11, -121, 123, 52, 123, -77, 12, 23, 22, 44, 98, -50, 58, -48, 90, 78, + 125, -32, 7, -81, 18, 116, 38, 81, -14, 23, 77, -90, 109, -3, 52, -21, -40, -105, -121, + 10, -11, 38, 38, 21, -71, -97, -41, 99, -7, 31, 9, -23, -78, 58, -31, 19, -98, -63, + -19, -95, -10, 23, -35, -116, -74, -72, -91, 119, 62, -111, -120, -114, -117, 100, 90, -87, -20, + -6, -49, 66, 34, 54, -73, -76, 95, -73, 40, 95, 39, 64, 44, -109, 40, 15, -40, -94, + 6, -45, 54, 40, 45, 69, -103, 88, 112, 85, -63, -127, 3, -30, -78, -80, -118, 100, 83, + -94, 23, 24, -33, 23, -78, 37, 47, -3, 26, -108, 81, -86, 107}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input.h new file mode 100644 index 00000000..69c055f9 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input.h @@ -0,0 +1,22 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_1_s16_input[220] = { + 13736, 23944, -5299, 7737, -26699, -14813, -17886, -1647, -23298, 24851, 5100, -25001, 20574, 1552, + -14239, -24575, -21416, 15542, -17167, -14164, 9242, 15063, 2831, -5102, 18611, -24404, -1195, -5944, + 26930, -29354, -20621, 26317, -2661, 21990, 13978, -6300, -10229, -22228, -17884, 16041, 4267, -4187, + 3334, 26553, -32377, -9858, -20242, -19193, -32259, -13251, -16675, 23520, -24663, -6962, 916, 29657, + -31552, -32068, -27539, -29974, -30519, 3367, 10799, 10676, 29031, -21278, -1862, -30681, 18441, -9753, + 16575, 5763, 7587, -11638, -4090, 22728, 5364, 5460, -20007, 6908, -15077, 19795, 18050, -5621, + -9286, -29639, 28802, 30444, -26491, 5531, 10353, 8677, -21402, -11352, 20251, -11648, -20007, 11954, + -11831, 25472, -628, 14150, -8121, -5260, 29895, 14203, -30641, 15588, -4199, -32034, -30587, -12619, + 5842, 17187, 22779, 16282, 1806, 18624, 25125, 22730, -8889, -7833, 9798, 25136, -1294, -10280, + -30367, 20369, 30464, -14168, -29635, 18393, 3661, -5063, 28190, -17325, -12483, 24939, -29775, 14131, + 9790, -7480, 21842, 32219, 18428, -1242, 16135, 6860, 17424, -27147, 18622, -19581, -8972, 10224, + -1452, -31682, 3691, 31065, 21477, 13232, 3679, -3275, 26922, 13443, 26116, 20579, -14769, 23870, + 23100, -11558, -25290, -29482, -5273, -15625, 9577, 8195, 3928, 8730, -21723, 25012, 15320, -31171, + 8455, 21432, -13983, 16298, 1905, -19173, 19859, 3255, -26975, -12538, 20319, -31892, 14187, 9356, + 7952, -5000, 6683, 8591, 12667, 26386, 22751, -10073, -5509, 4861, -19329, 17171, 15986, -2173, + -2405, 31618, -4030, 12743, -32321, 26494, 15520, -20860, 17165, -1792}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_bias.h new file mode 100644 index 00000000..5573eb00 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_bias.h @@ -0,0 +1,6 @@ +// Generated 
by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_1_s16_input_gate_bias[11] = {13543, 1638, 25244, 3970, 6981, 29954, 4375, 16836, 30316, 6762, 4189}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_hidden_weights.h new file mode 100644 index 00000000..17652417 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_hidden_weights.h @@ -0,0 +1,12 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_1_s16_input_gate_hidden_weights[121] = { + 110, -92, 49, 101, -32, -85, -5, -7, -92, 90, -52, -116, 2, -16, -70, -43, -93, 9, 62, 4, 24, + 104, -36, 35, -21, -50, -92, 90, -14, -49, -60, 63, -36, -22, -121, -47, 65, 93, -121, 107, -87, 36, + -122, -118, 21, 13, -75, -3, -39, -39, 24, -82, 4, 38, 122, -59, 2, 89, 20, 68, -65, 27, 95, + -123, -61, 84, 22, -111, 106, -66, -24, 51, -75, -83, 53, 19, -79, 6, 12, -19, 0, -4, 19, 122, + -101, -56, -1, 46, -12, 34, -128, 75, 90, -67, -108, -121, -13, 89, -24, 42, -108, 12, -20, -75, 19, + -43, -44, -83, 97, -41, -58, -9, -19, -107, 54, 21, 27, 59, -49, -13, 87}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_input_weights.h new file mode 100644 index 00000000..74548f4e --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/input_gate_input_weights.h @@ -0,0 +1,19 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_1_s16_input_gate_input_weights[242] = { + 58, -27, -78, 32, -77, -64, -79, 118, 59, -60, -38, 27, 48, -27, -79, -4, 110, -126, -20, + -119, 29, 76, 63, -63, 26, -108, -87, 29, 80, -52, 87, -3, 24, 36, 32, 53, 75, -113, + -83, 107, 41, 108, -54, 30, 115, 7, -60, -38, -106, -50, -114, -15, -112, 75, -21, -55, -3, + -28, -73, 26, 126, 34, 116, -126, 72, -63, 106, -25, -73, 65, 63, -19, -112, -45, 107, 65, + -55, -48, 74, 66, -117, 108, 110, 95, -86, 29, 34, -43, 121, 82, 43, -63, 66, 77, 122, + 10, -24, -115, -112, -60, 118, 19, -61, 85, -97, -85, 5, -41, 24, 93, 58, 58, 2, 99, + 38, 6, -88, 30, 114, -20, -104, 44, 98, -68, 35, -23, -68, 11, -104, -33, 54, -10, -106, + 6, 118, 15, 64, 29, -111, 109, -5, 43, 37, 99, -96, 86, 119, 54, 36, -112, 126, -120, + 92, 79, 120, -25, 38, -44, -70, -77, 110, 65, -110, 23, -104, -81, -104, 94, -45, -112, 40, + 69, -46, 22, -32, 45, -56, 93, -92, -30, -61, 55, 47, 15, 117, -113, 11, 22, 28, -118, + 0, -42, -119, 29, 123, -17, -51, -99, -91, 120, -1, -29, -99, -117, 93, 91, 91, -68, -65, + -88, 87, 77, -35, 27, 58, 71, -33, -58, 69, 119, 94, 112, 72, 57, 78, -42, 100, 84, + -112, 45, 25, 8, 30, -19, -83, 8, 79, 124, -88, -119, 92, 81}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output.h new file mode 100644 index 00000000..0c867fb9 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output.h @@ -0,0 +1,13 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_1_s16_output[110] = { + -318, 2734, 1144, 2729, -1234, 1629, -44, 1369, -8, -10, 15, -19, 384, 2444, 1079, 0, + 713, -162, 252, -19, -542, 164, 109, 100, -2191, 24, 0, 189, -2293, -456, 9, -460, + 1051, 470, 120, -4504, 259, -2564, 146, -2036, 971, 1754, 332, -2445, 923, -257, -1112, -1547, + -69, 797, -954, -2815, 35, -2393, -4818, 672, 4690, -181, -146, -536, -84, -917, -190, -3255, + 0, -1013, 1482, 328, -3916, -83, -1429, -2188, -215, -7, -2003, 825, -32, -1072, -341, -260, + -8, -758, 1097, 1027, -1, -1078, 11, -46, -134, -1857, -194, -561, 1380, -1835, -30, -65, + 0, -10, 236, -171, -1609, -2539, -1146, -348, -81, -275, 52, -7, 2, -1087}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_bias.h new file mode 100644 index 00000000..a4a36292 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_bias.h @@ -0,0 +1,7 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_1_s16_output_gate_bias[11] = + {8300, 1579, 11812, 11224, 18423, 11748, 9932, 26717, 23831, 11737, 10113}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_hidden_weights.h new file mode 100644 index 00000000..41efc9b4 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_hidden_weights.h @@ -0,0 +1,12 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_1_s16_output_gate_hidden_weights[121] = { + -45, -67, -20, 112, -54, 13, 107, 74, -96, -111, 120, -102, -12, -21, 123, 62, 98, -19, -12, 32, -55, + 72, 59, -34, 40, 13, -89, 27, 72, -119, 96, 119, 26, -37, 64, 110, 45, -99, 86, -101, -7, -12, + 80, 80, 1, 13, 114, -24, -96, 67, -17, 92, 8, 43, 28, 97, 85, -69, -112, -82, 125, -13, -84, + -16, 67, 79, 55, 29, -64, -127, 14, -26, -67, -11, 21, -44, -16, 27, 5, 95, -120, -126, -33, 67, + 25, -112, 44, 16, -80, -8, -119, 63, -63, 124, 80, 24, 8, 96, -112, 82, -3, -43, 94, -83, 120, + 10, 8, -128, 108, -126, -63, 82, 52, -68, -51, -98, 66, -86, 114, -47, 46}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_input_weights.h new file mode 100644 index 00000000..0b7e3439 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/output_gate_input_weights.h @@ -0,0 +1,18 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_1_s16_output_gate_input_weights[242] = { + -33, 89, 114, -9, -30, -112, -117, -4, 78, -28, 94, -22, 47, 73, 54, 28, -59, -96, -107, -22, -51, + -1, -21, 61, 38, 118, 112, 18, -63, 111, -95, 100, 78, -78, -31, 21, -23, 12, 126, 122, 86, 12, + 12, 107, 57, -75, 20, -74, -105, 53, 25, -110, -35, -62, -5, 87, 123, -31, -92, 72, -112, -26, 90, + -83, 101, -80, 44, -1, 80, 22, -30, -122, -97, 46, 29, 52, 19, 117, 89, -105, -16, 66, -39, 32, + -108, 81, -83, 29, 107, -119, -30, 77, 11, 53, 101, -8, 99, 90, 85, -26, 30, -26, -10, -40, -45, + -118, -98, -71, 1, 11, 85, -62, -97, 117, 56, 29, -62, -90, -81, -62, -27, 23, 124, -84, -18, -88, + 72, -84, 1, -34, -76, -27, -57, -46, 123, -118, -58, 42, 108, -46, -91, 13, 87, -127, -18, -57, 68, + -73, -49, -20, -24, 111, 19, 84, -96, -38, 66, -45, -14, -47, 78, 13, -19, 17, -65, 32, 90, -45, + 86, -79, 28, 124, -96, -56, -102, 35, -67, -101, 106, 100, 79, 112, 19, 66, 53, -58, -28, -15, 50, + -62, 54, 5, -52, -91, -122, -34, -111, 69, 83, 83, 39, -69, -98, 99, -88, -122, 18, 67, -48, 110, + -63, -93, 5, 71, 117, -83, -9, 75, -43, -101, -108, -54, 43, 2, -101, -68, 26, -24, 81, -12, -111, + -40, -54, 37, 63, 51, -11, 4, 61, 41, 70, -63}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_1_s16/test_data.h b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/test_data.h new file mode 100644 index 00000000..8355af79 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_1_s16/test_data.h @@ -0,0 +1,15 @@ +#include "cell_gate_bias.h" +#include "cell_gate_hidden_weights.h" +#include "cell_gate_input_weights.h" +#include "config_data.h" +#include "forget_gate_bias.h" +#include "forget_gate_hidden_weights.h" +#include "forget_gate_input_weights.h" +#include "input.h" +#include "input_gate_bias.h" +#include "input_gate_hidden_weights.h" +#include "input_gate_input_weights.h" +#include "output.h" +#include "output_gate_bias.h" +#include "output_gate_hidden_weights.h" 
+#include "output_gate_input_weights.h" diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_bias.h new file mode 100644 index 00000000..5a89575f --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_2_s16_cell_gate_bias[7] = {19513, 13891, 21356, 10099, 17079, 31695, 8446}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_hidden_weights.h new file mode 100644 index 00000000..7320ed78 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_hidden_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_2_s16_cell_gate_hidden_weights[49] = { + 103, -81, -66, 119, 103, 58, -7, -56, -41, -99, 2, 12, -23, 77, -79, -4, 56, + 113, -2, -52, -77, 8, -6, 68, 77, 125, 106, -66, 88, 27, 117, 12, -44, 41, + -6, -119, 52, -82, -7, -5, -57, 71, 7, -62, -115, -108, -1, 119, 94}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_input_weights.h new file mode 100644 index 00000000..abffd05c --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/cell_gate_input_weights.h @@ -0,0 +1,8 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_2_s16_cell_gate_input_weights[42] = { + -62, 75, 47, 104, 59, 76, 48, 52, 13, 68, -69, 93, 8, 8, -95, 26, -76, -49, 108, 81, -92, + -104, 80, -118, -87, 25, 112, -56, -14, 61, -10, -26, 43, -118, -86, -108, -27, 26, -40, 77, -63, -125}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/config_data.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/config_data.h new file mode 100644 index 00000000..3c54682d --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/config_data.h @@ -0,0 +1,34 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#define lstm_2_s16_time_major false +#define lstm_2_s16_batch_size 1 +#define lstm_2_s16_time_steps 9 +#define lstm_2_s16_input_size 6 +#define lstm_2_s16_hidden_size 7 +#define lstm_2_s16_cell_scale_power -9 +#define lstm_2_s16_output_zero_point 0 +#define lstm_2_s16_input_zero_point 0 +#define lstm_2_s16_cell_clip 32767 +#define lstm_2_s16_forget_to_cell_multiplier 1073741824 +#define lstm_2_s16_forget_to_cell_shift -14 +#define lstm_2_s16_input_to_cell_multiplier 1316479598 +#define lstm_2_s16_input_to_cell_shift -20 +#define lstm_2_s16_output_multiplier 1956298507 +#define lstm_2_s16_output_shift -17 +#define lstm_2_s16_output_gate_hidden_multiplier 1549742675 +#define lstm_2_s16_output_gate_hidden_shift -11 +#define lstm_2_s16_cell_gate_hidden_multiplier 1564226251 +#define lstm_2_s16_cell_gate_hidden_shift -12 +#define lstm_2_s16_forget_gate_hidden_multiplier 1670439145 +#define lstm_2_s16_forget_gate_hidden_shift -12 +#define lstm_2_s16_input_gate_hidden_multiplier 1164720936 +#define lstm_2_s16_input_gate_hidden_shift -10 +#define lstm_2_s16_output_gate_input_multiplier 1924775430 +#define lstm_2_s16_output_gate_input_shift -10 +#define lstm_2_s16_cell_gate_input_multiplier 1582330722 +#define 
lstm_2_s16_cell_gate_input_shift -11 +#define lstm_2_s16_forget_gate_input_multiplier 1281215547 +#define lstm_2_s16_forget_gate_input_shift -9 +#define lstm_2_s16_input_gate_input_multiplier 1578394576 +#define lstm_2_s16_input_gate_input_shift -9 diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_bias.h new file mode 100644 index 00000000..a673376e --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_2_s16_forget_gate_bias[7] = {20964, 16982, 16930, 22508, 31250, 30498, 16337}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_hidden_weights.h new file mode 100644 index 00000000..7990b6cd --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_hidden_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_2_s16_forget_gate_hidden_weights[49] = { + 118, 14, 8, -95, -3, -65, 2, -67, 41, 104, 18, 51, 21, -4, -86, 31, 56, + -60, -75, 74, 73, -12, 115, 1, -109, -119, 96, 15, -31, -37, 25, 0, -22, -24, + 95, 30, -6, 23, -25, -59, 105, -84, 102, -16, 19, -118, 42, -124, -38}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_input_weights.h new file mode 100644 index 00000000..d639622f --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/forget_gate_input_weights.h @@ -0,0 +1,8 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_2_s16_forget_gate_input_weights[42] = { + -37, -21, 118, -39, 90, -97, -99, 73, 59, -12, 1, -74, -39, -125, -60, 101, 68, -22, -65, 47, 55, + 114, 86, 48, 57, -72, -103, 103, -3, 78, 123, -51, 78, -100, -100, -54, -25, -40, -21, 33, 60, -107}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input.h new file mode 100644 index 00000000..a3b0c229 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input.h @@ -0,0 +1,10 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_2_s16_input[54] = { + 23536, 27304, 24693, 30688, 19633, -25440, 14520, -25776, -23552, -13305, -30589, -2019, 18827, 25138, + 24159, 18605, 10528, 3507, 13149, 27508, -23963, -28654, 12585, -1653, 14551, -23416, 6110, -24065, + -19733, -24258, 13594, 28016, 13389, 201, 3040, -9602, 7736, -26340, -32711, 13470, 27619, -4043, + 30212, -20972, -1625, 30113, 937, 17488, -13724, -26733, 25147, 11850, -17325, -19942}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_bias.h new file mode 100644 index 00000000..c2b86290 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_2_s16_input_gate_bias[7] = {28793, 9909, 4344, 1117, 3627, 30848, 32766}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_hidden_weights.h new file mode 100644 index 00000000..4fc2d2ba --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_hidden_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_2_s16_input_gate_hidden_weights[49] = { + 3, 53, -35, -6, -43, -23, -98, 16, -114, -96, 15, 40, 80, -82, 109, -22, 13, + 117, 90, 87, -52, 58, -46, 106, 34, -90, -63, -121, 106, 122, -101, 55, 7, 51, + -55, 7, 124, 53, 97, -96, -9, -61, -25, 123, -69, -1, -55, 69, -104}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_input_weights.h new file mode 100644 index 00000000..37d25b83 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/input_gate_input_weights.h @@ -0,0 +1,8 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_2_s16_input_gate_input_weights[42] = { + 83, 92, 125, 94, -48, 9, 118, 97, -55, -91, 102, -33, 107, 93, -3, 93, -122, 89, 117, 65, 89, + 80, -120, -87, 101, 34, 70, -7, 103, -55, -96, 82, 121, 58, 17, -28, 78, -96, 107, -127, -40, 26}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output.h new file mode 100644 index 00000000..70f397c1 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output.h @@ -0,0 +1,10 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_2_s16_output[63] = {1163, 635, -271, 765, -382, -1332, 230, -197, 83, 1107, 452, -1470, -597, + 433, 915, 1438, -536, 235, -44, -1806, 51, 367, 15, 92, 1041, -668, + -646, -282, -912, -703, 283, 1122, -585, 71, 239, 19, 34, -123, 1358, + 10, 266, 221, -49, -169, 0, 1190, -1863, -84, 304, 151, 313, 73, + -379, -2443, -498, -115, -164, 305, 28, -1771, -415, 981, 674}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_bias.h new file mode 100644 index 00000000..0ec709fc --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_2_s16_output_gate_bias[7] = {5138, 5363, 14184, 3723, 23687, 28588, 26805}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_hidden_weights.h new file mode 100644 index 00000000..69b797d5 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_hidden_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_2_s16_output_gate_hidden_weights[49] = { + 29, -8, -81, -77, -33, 23, 74, -78, 68, -77, 22, -74, 95, 112, 13, 97, -124, + -106, -3, -122, 110, -124, -74, 47, 97, 11, 44, 8, -100, 91, -103, 85, 68, 13, + 28, 119, -94, -74, -39, -15, -45, 60, 23, 37, 52, 70, 83, 111, -9}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_input_weights.h new file mode 100644 index 00000000..6263cf16 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/output_gate_input_weights.h @@ -0,0 +1,8 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_2_s16_output_gate_input_weights[42] = { + -83, -37, -5, -58, 37, -85, 37, 121, 112, -12, -104, -55, 116, -43, 33, -108, -15, 79, 8, -34, -64, + -12, -63, 20, 90, -89, -126, -49, -43, -16, 81, 65, 78, -7, -63, -92, -91, 8, -108, -8, 37, -67}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_2_s16/test_data.h b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/test_data.h new file mode 100644 index 00000000..8355af79 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_2_s16/test_data.h @@ -0,0 +1,15 @@ +#include "cell_gate_bias.h" +#include "cell_gate_hidden_weights.h" +#include "cell_gate_input_weights.h" +#include "config_data.h" +#include "forget_gate_bias.h" +#include "forget_gate_hidden_weights.h" +#include "forget_gate_input_weights.h" +#include "input.h" +#include "input_gate_bias.h" +#include "input_gate_hidden_weights.h" +#include "input_gate_input_weights.h" +#include "output.h" +#include "output_gate_bias.h" +#include "output_gate_hidden_weights.h" +#include "output_gate_input_weights.h" diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_bias.h 
b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_bias.h new file mode 100644 index 00000000..9c2539a5 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_one_time_step_s16_cell_gate_bias[3] = {20195, 27538, 32653}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_hidden_weights.h new file mode 100644 index 00000000..39f4ea18 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_hidden_weights.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_one_time_step_s16_cell_gate_hidden_weights[9] = {96, 122, 83, -16, 68, -124, -120, 18, -57}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_input_weights.h new file mode 100644 index 00000000..931d2608 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/cell_gate_input_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_one_time_step_s16_cell_gate_input_weights[66] = { + 87, 101, 93, 23, -94, -70, -51, -59, -76, -21, -16, -37, 122, -91, -112, -111, -46, -68, -38, 15, -38, 74, + -97, -9, -61, -7, -4, 94, 105, -112, -123, -18, -29, 49, 42, -13, 34, 83, 106, 60, -24, 96, 48, -86, + 66, 97, -90, -21, 101, -90, -6, 78, -1, 41, 2, 8, 11, -123, -79, -12, -36, -25, -37, -127, 73, 100}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/config_data.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/config_data.h new file mode 100644 index 00000000..1ee11ca3 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/config_data.h @@ -0,0 +1,34 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#define lstm_one_time_step_s16_time_major false +#define lstm_one_time_step_s16_batch_size 3 +#define lstm_one_time_step_s16_time_steps 1 +#define lstm_one_time_step_s16_input_size 22 +#define lstm_one_time_step_s16_hidden_size 3 +#define lstm_one_time_step_s16_cell_scale_power -10 +#define lstm_one_time_step_s16_output_zero_point 0 +#define lstm_one_time_step_s16_input_zero_point 0 +#define lstm_one_time_step_s16_cell_clip 32767 +#define lstm_one_time_step_s16_forget_to_cell_multiplier 1073741824 +#define lstm_one_time_step_s16_forget_to_cell_shift -14 +#define lstm_one_time_step_s16_input_to_cell_multiplier 1899594203 +#define lstm_one_time_step_s16_input_to_cell_shift -20 +#define lstm_one_time_step_s16_output_multiplier 1713359477 +#define lstm_one_time_step_s16_output_shift -17 +#define lstm_one_time_step_s16_output_gate_hidden_multiplier 1328489832 +#define lstm_one_time_step_s16_output_gate_hidden_shift -11 +#define lstm_one_time_step_s16_cell_gate_hidden_multiplier 1181032973 +#define lstm_one_time_step_s16_cell_gate_hidden_shift -10 +#define 
lstm_one_time_step_s16_forget_gate_hidden_multiplier 2122276288 +#define lstm_one_time_step_s16_forget_gate_hidden_shift -12 +#define lstm_one_time_step_s16_input_gate_hidden_multiplier 1120396508 +#define lstm_one_time_step_s16_input_gate_hidden_shift -10 +#define lstm_one_time_step_s16_output_gate_input_multiplier 1673987981 +#define lstm_one_time_step_s16_output_gate_input_shift -10 +#define lstm_one_time_step_s16_cell_gate_input_multiplier 1551228863 +#define lstm_one_time_step_s16_cell_gate_input_shift -8 +#define lstm_one_time_step_s16_forget_gate_input_multiplier 1681427928 +#define lstm_one_time_step_s16_forget_gate_input_shift -11 +#define lstm_one_time_step_s16_input_gate_input_multiplier 1677707955 +#define lstm_one_time_step_s16_input_gate_input_shift -8 diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_bias.h new file mode 100644 index 00000000..ee744c92 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_one_time_step_s16_forget_gate_bias[3] = {2956, 21127, 23202}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_hidden_weights.h new file mode 100644 index 00000000..1c2e881b --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_hidden_weights.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_one_time_step_s16_forget_gate_hidden_weights[9] = {43, 123, -49, -80, 18, -113, 97, 41, 74}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_input_weights.h new file mode 100644 index 00000000..883e0c9b --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/forget_gate_input_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_one_time_step_s16_forget_gate_input_weights[66] = { + 25, -38, 56, -13, 108, -21, 92, -97, 118, 105, 101, -2, -44, 123, -17, -23, 3, 92, 118, -115, -18, 78, + 27, 103, -108, -41, -8, 37, -115, 55, 59, 56, 15, 76, -119, 18, -72, 124, 20, -8, -29, 75, 66, 28, + 51, -44, 42, -13, 101, 41, -102, 30, -118, -55, -19, 13, -106, 19, 19, 118, 106, -79, -15, -125, 29, -109}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input.h new file mode 100644 index 00000000..bcedeea9 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input.h @@ -0,0 +1,11 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_one_time_step_s16_input[66] = { + 26737, -3312, 27665, 19141, 4716, -7008, 4549, -21899, -25689, 4240, -17667, 22726, 10924, 30458, + 7515, -17421, -25080, -26557, -16850, 6022, -22044, -32139, -10312, 28342, -30325, 25737, 10769, 3648, + -11893, -26036, -15103, -9396, -16565, -12245, -25711, -27976, 8469, -3610, 7150, -23798, -1771, -586, + -7059, -25873, 24134, -8545, -10029, -8123, 17637, -9169, 19492, 24515, 26091, -15449, 16251, 3491, + -32627, 28448, -6072, 26552, 32338, 258, 29453, -29835, -17778, -15912}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_bias.h new file mode 100644 index 00000000..10828957 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_one_time_step_s16_input_gate_bias[3] = {20943, 5502, 20939}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_hidden_weights.h new file mode 100644 index 00000000..a7953bf8 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_hidden_weights.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_one_time_step_s16_input_gate_hidden_weights[9] = {76, 36, 15, 85, 17, 55, 33, 25, -31}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_input_weights.h new file mode 100644 index 00000000..74270d4e --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/input_gate_input_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_one_time_step_s16_input_gate_input_weights[66] = { + 18, -27, 78, 115, 101, -86, 106, 102, 23, 30, 13, -90, -89, -67, 51, 11, -97, -34, -113, 87, 88, 15, + 55, -55, -116, -11, -72, 101, -107, 59, -29, -119, 109, -4, 120, -22, 4, 57, 26, -87, 79, 19, 81, 28, + -126, 96, 115, -100, 27, 34, 14, 78, 84, 28, -56, -67, 75, 33, -93, -70, 19, 76, 103, -47, 35, 69}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output.h new file mode 100644 index 00000000..d2858128 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int16_t lstm_one_time_step_s16_output[9] = {2054, 0, 0, 1138, 129, 0, -61, -2801, 225}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_bias.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_bias.h new file mode 100644 index 00000000..0fc61ce6 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_bias.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int64_t lstm_one_time_step_s16_output_gate_bias[3] = {32739, 16303, 29216}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_hidden_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_hidden_weights.h new file mode 100644 index 00000000..3f105f76 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_hidden_weights.h @@ -0,0 +1,6 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. +#pragma once +#include + +const int8_t lstm_one_time_step_s16_output_gate_hidden_weights[9] = {63, 24, 19, 25, 71, -107, 75, -20, 80}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_input_weights.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_input_weights.h new file mode 100644 index 00000000..246948eb --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/output_gate_input_weights.h @@ -0,0 +1,9 @@ +// Generated by RefactoredTestGen/test.py using flatc version 23.5.26 +// Interpreter from tflite_micro runtime version 0.dev20240224054047-gcfa4c91. 
+#pragma once +#include + +const int8_t lstm_one_time_step_s16_output_gate_input_weights[66] = { + 107, 18, 61, 4, 27, 96, 76, -43, -48, -63, -25, 18, -58, -69, 101, -75, -36, 68, 61, 126, 122, 78, + 90, -88, -36, 115, -6, -28, 99, 96, 55, -108, -34, 70, 102, 116, 98, -103, -55, 76, 98, -78, -53, 81, + -47, 66, 98, -128, 103, -98, -124, -25, -37, -19, -60, 4, 46, 28, 90, 88, -90, -71, 124, 52, 58, 83}; \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/test_data.h b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/test_data.h new file mode 100644 index 00000000..8355af79 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/lstm_one_time_step_s16/test_data.h @@ -0,0 +1,15 @@ +#include "cell_gate_bias.h" +#include "cell_gate_hidden_weights.h" +#include "cell_gate_input_weights.h" +#include "config_data.h" +#include "forget_gate_bias.h" +#include "forget_gate_hidden_weights.h" +#include "forget_gate_input_weights.h" +#include "input.h" +#include "input_gate_bias.h" +#include "input_gate_hidden_weights.h" +#include "input_gate_input_weights.h" +#include "output.h" +#include "output_gate_bias.h" +#include "output_gate_hidden_weights.h" +#include "output_gate_input_weights.h" diff --git a/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/CMakeLists.txt b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/CMakeLists.txt new file mode 100644 index 00000000..16e9eddd --- /dev/null +++ b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# SPDX-FileCopyrightText: Copyright 2010-2022, 2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +add_cmsis_nn_unit_test_executable(lstm_unidirectional_s16) + +target_sources(lstm_unidirectional_s16 PRIVATE + Unity/unity_test_arm_lstm_unidirectional_s16.c + Unity/TestRunner/unity_test_arm_lstm_unidirectional_s16_runner.c) \ No newline at end of file diff --git a/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/Unity/unity_test_arm_lstm_unidirectional_s16.c b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/Unity/unity_test_arm_lstm_unidirectional_s16.c new file mode 100644 index 00000000..b9ef170e --- /dev/null +++ b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/Unity/unity_test_arm_lstm_unidirectional_s16.c @@ -0,0 +1,48 @@ +/* + * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "../test_arm_lstm_unidirectional_s16.c" +#include "unity.h" + +#ifdef USING_FVP_CORSTONE_300 +extern void uart_init(void); +#endif + +/* This function is called from the autogenerated file. + * The name must be exactly like this + */ +void setUp(void) +{ /* This is run before EACH TEST */ +#ifdef USING_FVP_CORSTONE_300 + uart_init(); +#endif +} + +/* This function is called from the autogenerated file. + * The name must be exactly like this + */ +void tearDown(void) {} +void test_lstm_1_s16(void) { lstm_1_s16(); } +void test_lstm_2_s16(void) { lstm_2_s16(); } +void test_lstm_one_time_step_s16(void) { lstm_one_time_step_s16(); } diff --git a/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/test_arm_lstm_unidirectional_s16.c b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/test_arm_lstm_unidirectional_s16.c new file mode 100644 index 00000000..83abb468 --- /dev/null +++ b/Tests/UnitTest/TestCases/test_arm_lstm_unidirectional_s16/test_arm_lstm_unidirectional_s16.c @@ -0,0 +1,475 @@ +/* + * SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../TestData/lstm_1_s16/test_data.h" +#include "../TestData/lstm_2_s16/test_data.h" +#include "../TestData/lstm_one_time_step_s16/test_data.h" +#include "../Utils/validate.h" +#include +#include +#include +#include +#include + +// update the buffer size if adding a unit test with larger buffer. +#define LARGEST_BUFFER_SIZE lstm_1_s16_hidden_size *lstm_1_s16_batch_size *lstm_1_s16_time_steps + +int16_t buffer1[LARGEST_BUFFER_SIZE]; +int16_t buffer2[LARGEST_BUFFER_SIZE]; +int16_t buffer3[LARGEST_BUFFER_SIZE]; + +void lstm_1_s16(void) +{ + int16_t output[lstm_1_s16_batch_size * lstm_1_s16_time_steps * lstm_1_s16_hidden_size] = {0}; + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + const int16_t *output_ref = &lstm_1_s16_output[0]; + const int32_t output_ref_size = lstm_1_s16_batch_size * lstm_1_s16_time_steps * lstm_1_s16_hidden_size; + + int64_t input_data_kernel_sum[lstm_1_s16_hidden_size]; + int64_t forget_data_kernel_sum[lstm_1_s16_hidden_size]; + int64_t cell_data_kernel_sum[lstm_1_s16_hidden_size]; + int64_t output_data_kernel_sum[lstm_1_s16_hidden_size]; + + int64_t input_hidden_kernel_sum[lstm_1_s16_hidden_size]; + int64_t forget_hidden_kernel_sum[lstm_1_s16_hidden_size]; + int64_t cell_hidden_kernel_sum[lstm_1_s16_hidden_size]; + int64_t output_hidden_kernel_sum[lstm_1_s16_hidden_size]; + + arm_vector_sum_s8_s64(&input_data_kernel_sum[0], + lstm_1_s16_input_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_input_gate_input_weights[0], + lstm_1_s16_input_zero_point, + &lstm_1_s16_input_gate_bias[0]); + arm_vector_sum_s8_s64(&forget_data_kernel_sum[0], + lstm_1_s16_input_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_forget_gate_input_weights[0], + lstm_1_s16_input_zero_point, + &lstm_1_s16_forget_gate_bias[0]); + arm_vector_sum_s8_s64(&cell_data_kernel_sum[0], + lstm_1_s16_input_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_cell_gate_input_weights[0], + lstm_1_s16_input_zero_point, + &lstm_1_s16_cell_gate_bias[0]); + 
arm_vector_sum_s8_s64(&output_data_kernel_sum[0], + lstm_1_s16_input_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_output_gate_input_weights[0], + lstm_1_s16_input_zero_point, + &lstm_1_s16_output_gate_bias[0]); + + arm_vector_sum_s8_s64(&input_hidden_kernel_sum[0], + lstm_1_s16_hidden_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_input_gate_hidden_weights[0], + -lstm_1_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&forget_hidden_kernel_sum[0], + lstm_1_s16_hidden_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_forget_gate_hidden_weights[0], + -lstm_1_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&cell_hidden_kernel_sum[0], + lstm_1_s16_hidden_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_cell_gate_hidden_weights[0], + -lstm_1_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&output_hidden_kernel_sum[0], + lstm_1_s16_hidden_size, + lstm_1_s16_hidden_size, + &lstm_1_s16_output_gate_hidden_weights[0], + -lstm_1_s16_output_zero_point, + NULL); + + // INPUT GATE + const cmsis_nn_lstm_gate gate_input = {lstm_1_s16_input_gate_input_multiplier, + lstm_1_s16_input_gate_input_shift, + &lstm_1_s16_input_gate_input_weights[0], + &input_data_kernel_sum[0], + lstm_1_s16_input_gate_hidden_multiplier, + lstm_1_s16_input_gate_hidden_shift, + &lstm_1_s16_input_gate_hidden_weights[0], + &input_hidden_kernel_sum[0], + &lstm_1_s16_input_gate_bias[0], + ARM_SIGMOID}; + + // FORGET GATE + const cmsis_nn_lstm_gate gate_forget = {lstm_1_s16_forget_gate_input_multiplier, + lstm_1_s16_forget_gate_input_shift, + &lstm_1_s16_forget_gate_input_weights[0], + &forget_data_kernel_sum[0], + lstm_1_s16_forget_gate_hidden_multiplier, + lstm_1_s16_forget_gate_hidden_shift, + &lstm_1_s16_forget_gate_hidden_weights[0], + &forget_hidden_kernel_sum[0], + &lstm_1_s16_forget_gate_bias[0], + ARM_SIGMOID}; + + // CELL GATE + const cmsis_nn_lstm_gate gate_cell = {lstm_1_s16_cell_gate_input_multiplier, + lstm_1_s16_cell_gate_input_shift, + &lstm_1_s16_cell_gate_input_weights[0], + 
&cell_data_kernel_sum[0], + lstm_1_s16_cell_gate_hidden_multiplier, + lstm_1_s16_cell_gate_hidden_shift, + &lstm_1_s16_cell_gate_hidden_weights[0], + &cell_hidden_kernel_sum[0], + &lstm_1_s16_cell_gate_bias[0], + ARM_TANH}; + + // OUTPUT GATE + const cmsis_nn_lstm_gate gate_output = {lstm_1_s16_output_gate_input_multiplier, + lstm_1_s16_output_gate_input_shift, + &lstm_1_s16_output_gate_input_weights[0], + &output_data_kernel_sum[0], + lstm_1_s16_output_gate_hidden_multiplier, + lstm_1_s16_output_gate_hidden_shift, + &lstm_1_s16_output_gate_hidden_weights[0], + &output_hidden_kernel_sum[0], + &lstm_1_s16_output_gate_bias[0], + ARM_SIGMOID}; + + // LSTM DATA + const cmsis_nn_lstm_params params = {lstm_1_s16_time_major, + lstm_1_s16_batch_size, + lstm_1_s16_time_steps, + lstm_1_s16_input_size, + lstm_1_s16_hidden_size, + lstm_1_s16_input_zero_point, + lstm_1_s16_forget_to_cell_multiplier, + lstm_1_s16_forget_to_cell_shift, + lstm_1_s16_input_to_cell_multiplier, + lstm_1_s16_input_to_cell_shift, + lstm_1_s16_cell_clip, + lstm_1_s16_cell_scale_power, + lstm_1_s16_output_multiplier, + lstm_1_s16_output_shift, + lstm_1_s16_output_zero_point, + gate_forget, + gate_input, + gate_cell, + gate_output}; + + cmsis_nn_lstm_context buffers; + buffers.temp1 = buffer1; + buffers.temp2 = buffer2; + buffers.cell_state = buffer3; + + arm_cmsis_nn_status result = arm_lstm_unidirectional_s16(lstm_1_s16_input, output, ¶ms, &buffers); + + TEST_ASSERT_EQUAL(expected, result); + TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size)); +} +void lstm_2_s16(void) +{ + int16_t output[lstm_2_s16_batch_size * lstm_2_s16_time_steps * lstm_2_s16_hidden_size] = {0}; + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + const int16_t *output_ref = &lstm_2_s16_output[0]; + const int32_t output_ref_size = lstm_2_s16_batch_size * lstm_2_s16_time_steps * lstm_2_s16_hidden_size; + + int64_t input_data_kernel_sum[lstm_2_s16_hidden_size]; + int64_t 
forget_data_kernel_sum[lstm_2_s16_hidden_size]; + int64_t cell_data_kernel_sum[lstm_2_s16_hidden_size]; + int64_t output_data_kernel_sum[lstm_2_s16_hidden_size]; + + int64_t input_hidden_kernel_sum[lstm_2_s16_hidden_size]; + int64_t forget_hidden_kernel_sum[lstm_2_s16_hidden_size]; + int64_t cell_hidden_kernel_sum[lstm_2_s16_hidden_size]; + int64_t output_hidden_kernel_sum[lstm_2_s16_hidden_size]; + + arm_vector_sum_s8_s64(&input_data_kernel_sum[0], + lstm_2_s16_input_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_input_gate_input_weights[0], + lstm_2_s16_input_zero_point, + &lstm_2_s16_input_gate_bias[0]); + arm_vector_sum_s8_s64(&forget_data_kernel_sum[0], + lstm_2_s16_input_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_forget_gate_input_weights[0], + lstm_2_s16_input_zero_point, + &lstm_2_s16_forget_gate_bias[0]); + arm_vector_sum_s8_s64(&cell_data_kernel_sum[0], + lstm_2_s16_input_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_cell_gate_input_weights[0], + lstm_2_s16_input_zero_point, + &lstm_2_s16_cell_gate_bias[0]); + arm_vector_sum_s8_s64(&output_data_kernel_sum[0], + lstm_2_s16_input_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_output_gate_input_weights[0], + lstm_2_s16_input_zero_point, + &lstm_2_s16_output_gate_bias[0]); + + arm_vector_sum_s8_s64(&input_hidden_kernel_sum[0], + lstm_2_s16_hidden_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_input_gate_hidden_weights[0], + -lstm_2_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&forget_hidden_kernel_sum[0], + lstm_2_s16_hidden_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_forget_gate_hidden_weights[0], + -lstm_2_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&cell_hidden_kernel_sum[0], + lstm_2_s16_hidden_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_cell_gate_hidden_weights[0], + -lstm_2_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&output_hidden_kernel_sum[0], + lstm_2_s16_hidden_size, + lstm_2_s16_hidden_size, + &lstm_2_s16_output_gate_hidden_weights[0], + 
-lstm_2_s16_output_zero_point, + NULL); + + // INPUT GATE + const cmsis_nn_lstm_gate gate_input = {lstm_2_s16_input_gate_input_multiplier, + lstm_2_s16_input_gate_input_shift, + &lstm_2_s16_input_gate_input_weights[0], + &input_data_kernel_sum[0], + lstm_2_s16_input_gate_hidden_multiplier, + lstm_2_s16_input_gate_hidden_shift, + &lstm_2_s16_input_gate_hidden_weights[0], + &input_hidden_kernel_sum[0], + &lstm_2_s16_input_gate_bias[0], + ARM_SIGMOID}; + + // FORGET GATE + const cmsis_nn_lstm_gate gate_forget = {lstm_2_s16_forget_gate_input_multiplier, + lstm_2_s16_forget_gate_input_shift, + &lstm_2_s16_forget_gate_input_weights[0], + &forget_data_kernel_sum[0], + lstm_2_s16_forget_gate_hidden_multiplier, + lstm_2_s16_forget_gate_hidden_shift, + &lstm_2_s16_forget_gate_hidden_weights[0], + &forget_hidden_kernel_sum[0], + &lstm_2_s16_forget_gate_bias[0], + ARM_SIGMOID}; + + // CELL GATE + const cmsis_nn_lstm_gate gate_cell = {lstm_2_s16_cell_gate_input_multiplier, + lstm_2_s16_cell_gate_input_shift, + &lstm_2_s16_cell_gate_input_weights[0], + &cell_data_kernel_sum[0], + lstm_2_s16_cell_gate_hidden_multiplier, + lstm_2_s16_cell_gate_hidden_shift, + &lstm_2_s16_cell_gate_hidden_weights[0], + &cell_hidden_kernel_sum[0], + &lstm_2_s16_cell_gate_bias[0], + ARM_TANH}; + + // OUTPUT GATE + const cmsis_nn_lstm_gate gate_output = {lstm_2_s16_output_gate_input_multiplier, + lstm_2_s16_output_gate_input_shift, + &lstm_2_s16_output_gate_input_weights[0], + &output_data_kernel_sum[0], + lstm_2_s16_output_gate_hidden_multiplier, + lstm_2_s16_output_gate_hidden_shift, + &lstm_2_s16_output_gate_hidden_weights[0], + &output_hidden_kernel_sum[0], + &lstm_2_s16_output_gate_bias[0], + ARM_SIGMOID}; + + // LSTM DATA + const cmsis_nn_lstm_params params = {lstm_2_s16_time_major, + lstm_2_s16_batch_size, + lstm_2_s16_time_steps, + lstm_2_s16_input_size, + lstm_2_s16_hidden_size, + lstm_2_s16_input_zero_point, + lstm_2_s16_forget_to_cell_multiplier, + lstm_2_s16_forget_to_cell_shift, + 
lstm_2_s16_input_to_cell_multiplier, + lstm_2_s16_input_to_cell_shift, + lstm_2_s16_cell_clip, + lstm_2_s16_cell_scale_power, + lstm_2_s16_output_multiplier, + lstm_2_s16_output_shift, + lstm_2_s16_output_zero_point, + gate_forget, + gate_input, + gate_cell, + gate_output}; + + cmsis_nn_lstm_context buffers; + buffers.temp1 = buffer1; + buffers.temp2 = buffer2; + buffers.cell_state = buffer3; + + arm_cmsis_nn_status result = arm_lstm_unidirectional_s16(lstm_2_s16_input, output, ¶ms, &buffers); + + TEST_ASSERT_EQUAL(expected, result); + TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size)); +} +void lstm_one_time_step_s16(void) +{ + int16_t output[lstm_one_time_step_s16_batch_size * lstm_one_time_step_s16_time_steps * + lstm_one_time_step_s16_hidden_size] = {0}; + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + const int16_t *output_ref = &lstm_one_time_step_s16_output[0]; + const int32_t output_ref_size = + lstm_one_time_step_s16_batch_size * lstm_one_time_step_s16_time_steps * lstm_one_time_step_s16_hidden_size; + + int64_t input_data_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t forget_data_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t cell_data_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t output_data_kernel_sum[lstm_one_time_step_s16_hidden_size]; + + int64_t input_hidden_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t forget_hidden_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t cell_hidden_kernel_sum[lstm_one_time_step_s16_hidden_size]; + int64_t output_hidden_kernel_sum[lstm_one_time_step_s16_hidden_size]; + + arm_vector_sum_s8_s64(&input_data_kernel_sum[0], + lstm_one_time_step_s16_input_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_input_gate_input_weights[0], + lstm_one_time_step_s16_input_zero_point, + &lstm_one_time_step_s16_input_gate_bias[0]); + arm_vector_sum_s8_s64(&forget_data_kernel_sum[0], + lstm_one_time_step_s16_input_size, + 
lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_forget_gate_input_weights[0], + lstm_one_time_step_s16_input_zero_point, + &lstm_one_time_step_s16_forget_gate_bias[0]); + arm_vector_sum_s8_s64(&cell_data_kernel_sum[0], + lstm_one_time_step_s16_input_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_cell_gate_input_weights[0], + lstm_one_time_step_s16_input_zero_point, + &lstm_one_time_step_s16_cell_gate_bias[0]); + arm_vector_sum_s8_s64(&output_data_kernel_sum[0], + lstm_one_time_step_s16_input_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_output_gate_input_weights[0], + lstm_one_time_step_s16_input_zero_point, + &lstm_one_time_step_s16_output_gate_bias[0]); + + arm_vector_sum_s8_s64(&input_hidden_kernel_sum[0], + lstm_one_time_step_s16_hidden_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_input_gate_hidden_weights[0], + -lstm_one_time_step_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&forget_hidden_kernel_sum[0], + lstm_one_time_step_s16_hidden_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_forget_gate_hidden_weights[0], + -lstm_one_time_step_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&cell_hidden_kernel_sum[0], + lstm_one_time_step_s16_hidden_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_cell_gate_hidden_weights[0], + -lstm_one_time_step_s16_output_zero_point, + NULL); + arm_vector_sum_s8_s64(&output_hidden_kernel_sum[0], + lstm_one_time_step_s16_hidden_size, + lstm_one_time_step_s16_hidden_size, + &lstm_one_time_step_s16_output_gate_hidden_weights[0], + -lstm_one_time_step_s16_output_zero_point, + NULL); + + // INPUT GATE + const cmsis_nn_lstm_gate gate_input = {lstm_one_time_step_s16_input_gate_input_multiplier, + lstm_one_time_step_s16_input_gate_input_shift, + &lstm_one_time_step_s16_input_gate_input_weights[0], + &input_data_kernel_sum[0], + lstm_one_time_step_s16_input_gate_hidden_multiplier, + 
lstm_one_time_step_s16_input_gate_hidden_shift, + &lstm_one_time_step_s16_input_gate_hidden_weights[0], + &input_hidden_kernel_sum[0], + &lstm_one_time_step_s16_input_gate_bias[0], + ARM_SIGMOID}; + + // FORGET GATE + const cmsis_nn_lstm_gate gate_forget = {lstm_one_time_step_s16_forget_gate_input_multiplier, + lstm_one_time_step_s16_forget_gate_input_shift, + &lstm_one_time_step_s16_forget_gate_input_weights[0], + &forget_data_kernel_sum[0], + lstm_one_time_step_s16_forget_gate_hidden_multiplier, + lstm_one_time_step_s16_forget_gate_hidden_shift, + &lstm_one_time_step_s16_forget_gate_hidden_weights[0], + &forget_hidden_kernel_sum[0], + &lstm_one_time_step_s16_forget_gate_bias[0], + ARM_SIGMOID}; + + // CELL GATE + const cmsis_nn_lstm_gate gate_cell = {lstm_one_time_step_s16_cell_gate_input_multiplier, + lstm_one_time_step_s16_cell_gate_input_shift, + &lstm_one_time_step_s16_cell_gate_input_weights[0], + &cell_data_kernel_sum[0], + lstm_one_time_step_s16_cell_gate_hidden_multiplier, + lstm_one_time_step_s16_cell_gate_hidden_shift, + &lstm_one_time_step_s16_cell_gate_hidden_weights[0], + &cell_hidden_kernel_sum[0], + &lstm_one_time_step_s16_cell_gate_bias[0], + ARM_TANH}; + + // OUTPUT GATE + const cmsis_nn_lstm_gate gate_output = {lstm_one_time_step_s16_output_gate_input_multiplier, + lstm_one_time_step_s16_output_gate_input_shift, + &lstm_one_time_step_s16_output_gate_input_weights[0], + &output_data_kernel_sum[0], + lstm_one_time_step_s16_output_gate_hidden_multiplier, + lstm_one_time_step_s16_output_gate_hidden_shift, + &lstm_one_time_step_s16_output_gate_hidden_weights[0], + &output_hidden_kernel_sum[0], + &lstm_one_time_step_s16_output_gate_bias[0], + ARM_SIGMOID}; + + // LSTM DATA + const cmsis_nn_lstm_params params = {lstm_one_time_step_s16_time_major, + lstm_one_time_step_s16_batch_size, + lstm_one_time_step_s16_time_steps, + lstm_one_time_step_s16_input_size, + lstm_one_time_step_s16_hidden_size, + lstm_one_time_step_s16_input_zero_point, + 
lstm_one_time_step_s16_forget_to_cell_multiplier, + lstm_one_time_step_s16_forget_to_cell_shift, + lstm_one_time_step_s16_input_to_cell_multiplier, + lstm_one_time_step_s16_input_to_cell_shift, + lstm_one_time_step_s16_cell_clip, + lstm_one_time_step_s16_cell_scale_power, + lstm_one_time_step_s16_output_multiplier, + lstm_one_time_step_s16_output_shift, + lstm_one_time_step_s16_output_zero_point, + gate_forget, + gate_input, + gate_cell, + gate_output}; + + cmsis_nn_lstm_context buffers; + buffers.temp1 = buffer1; + buffers.temp2 = buffer2; + buffers.cell_state = buffer3; + + arm_cmsis_nn_status result = arm_lstm_unidirectional_s16(lstm_one_time_step_s16_input, output, &params, &buffers); + + TEST_ASSERT_EQUAL(expected, result); + TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size)); +} \ No newline at end of file