diff --git a/README.md b/README.md
index 1c3ba33..71f654e 100644
--- a/README.md
+++ b/README.md
@@ -101,114 +101,214 @@
Score (Accuracy) |
- CNN Large INT8 * |
+ CNN Large INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.931 |
+ 0.923 |
- CNN Medium INT8 * |
+ CNN Medium INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.911 |
+ 0.905 |
- CNN Small INT8 * |
+ CNN Small INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.912 |
+ 0.902 |
- DNN Large INT8 * |
+ DNN Large INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.863 |
+ 0.860 |
- DNN Medium INT8 * |
+ DNN Medium INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.844 |
+ 0.839 |
- DNN Small INT8 * |
+ DNN Small INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.825 |
+ 0.821 |
- DS-CNN Clustered FP32 * |
+ DS-CNN Large Clustered FP32 * |
FP32 |
TensorFlow Lite |
:heavy_check_mark: |
- :heavy_multiplication_x: |
+ :heavy_check_mark: |
:heavy_check_mark: |
:heavy_multiplication_x: |
- 0.950 |
+ 0.948 |
- DS-CNN Clustered INT8 * |
+ DS-CNN Large Clustered INT8 * |
INT8 |
TensorFlow Lite |
- :heavy_multiplication_x: |
:heavy_check_mark: |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.940 |
+ :heavy_check_mark: |
+ 0.939 |
- DS-CNN Large INT8 * |
+ DS-CNN Large INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: HERO |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.946 |
+ 0.945 |
- DS-CNN Medium INT8 * |
+ DS-CNN Medium INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: HERO |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.941 |
+ 0.939 |
- DS-CNN Small INT8 * |
+ DS-CNN Small INT8 * |
INT8 |
TensorFlow Lite |
:heavy_check_mark: |
:heavy_check_mark: HERO |
:heavy_check_mark: |
:heavy_check_mark: |
- 0.935 |
+ 0.931 |
+
+
+ DS-CNN Small INT16 * |
+ INT16 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: HERO |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ 0.934 |
+
+
+ CNN Large FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.934 |
+
+
+ CNN Medium FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.918 |
+
+
+ CNN Small FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.922 |
+
+
+ DNN Large FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.867 |
+
+
+ DNN Medium FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.850 |
+
+
+ DNN Small FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.836 |
+
+
+ DS-CNN Large FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: HERO |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.950 |
+
+
+ DS-CNN Medium FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: HERO |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.943 |
+
+
+ DS-CNN Small FP32 * |
+ FP32 |
+ TensorFlow Lite |
+ :heavy_check_mark: |
+ :heavy_check_mark: HERO |
+ :heavy_check_mark: |
+ :heavy_multiplication_x: |
+ 0.939 |
MicroNet Large INT8 |
diff --git a/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite b/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite
index 9c9da85..a85250a 100644
Binary files a/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite and b/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite differ
diff --git a/models/experimental/har_cnn/har_int8.tflite b/models/experimental/har_cnn/har_int8.tflite
index 9d65d7e..a85b125 100644
Binary files a/models/experimental/har_cnn/har_int8.tflite and b/models/experimental/har_cnn/har_int8.tflite differ
diff --git a/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite b/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite
index f188cd2..65e2043 100644
Binary files a/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite and b/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite differ
diff --git a/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite b/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite
index 3270fe7..5a77ec3 100644
Binary files a/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite and b/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite differ
diff --git a/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite b/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite
index 5a45bf0..b879213 100644
Binary files a/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite and b/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite differ
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/README.md b/models/keyword_spotting/cnn_large/model_package_tf/README.md
new file mode 100644
index 0000000..b0cbfe4
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# CNN Large model package
+
+This folder contains code that will allow you to recreate the CNN Large keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | CNN_Large |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+To recreate the model you will need to use ```Python3.7``` and install the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script uses the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and evaluate them on the test sets. Both an fp32 version and a quantized version are produced;
+the quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch, you can do so by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model that reaches the accuracy stated in this repository.
+Note that the TFLite export will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path_to_checkpoint>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers of 128 neurons each, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions (such as number of layers and convolution filter size/stride) as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more information on *model_size_info* for each network architecture, see
+[models.py](models.py).
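+
+For illustration, the sketch below shows one way *model_size_info* could map onto a network for the DNN example
+above, where each list entry is treated as the width of one fully-connected layer. This is only a simplified sketch;
+the actual construction of every architecture (including the CNN variants) lives in [models.py](models.py).
+
+```python
+import tensorflow as tf
+
+def build_dnn(model_size_info, fingerprint_size, label_count):
+    """Builds a plain DNN: one Dense layer per entry in model_size_info.
+
+    fingerprint_size: length of the flattened MFCC input feature vector.
+    label_count: number of output classes (wanted words + silence + unknown).
+    """
+    model = tf.keras.Sequential()
+    model.add(tf.keras.layers.InputLayer(input_shape=(fingerprint_size,)))
+    for units in model_size_info:  # e.g. [128, 128, 128]
+        model.add(tf.keras.layers.Dense(units, activation='relu'))
+    model.add(tf.keras.layers.Dense(label_count))  # logits over the labels
+    return model
+```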
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint_path>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. We use 32 weight clusters and the kmeans++ cluster initialization method as the clustering hyperparameters.
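+
+For reference, the core of this optimization with the clustering API looks roughly like the sketch below; the full
+clustering and fine-tuning flow lives in optimisations.py, and `trained_model` here stands for the Keras model
+restored from your checkpoint.
+
+```python
+import tensorflow_model_optimization as tfmot
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+}
+
+# Wrap the trained model so its weights are constrained to 32 clusters,
+# fine-tune for a reduced number of steps, then strip the clustering
+# wrappers before exporting or quantizing the model.
+clustered_model = tfmot.clustering.keras.cluster_weights(trained_model, **clustering_params)
+# ... compile and fine-tune clustered_model here ...
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```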
+
+To optimize your trained model (e.g. a DNN), you need a trained model checkpoint on which to run clustering and fine-tuning.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint_path>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
+
+## Quantization and TFLite Conversion
+
+As part of this update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantizing the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint_path> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional* and should be used when a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
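+
+If you want to confirm what the converter produced, you can inspect the input and output types of the generated
+TFLite file with the TFLite interpreter, for example:
+
+```python
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path='dnn_quantized.tflite')
+interpreter.allocate_tensors()
+
+# Reports int8 for --inference_type int8, int16 for int16, and float32 otherwise.
+print('input dtype: ', interpreter.get_input_details()[0]['dtype'])
+print('output dtype:', interpreter.get_output_details()[0]['dtype'])
+```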
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint_path> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+ return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+ return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+ if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+ # Default to the standard builtin ops so conversion also works when no
+ # fully quantized input/output type is requested (inference_type == 'fp32').
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS
+ if inference_type == 'int8':
+ converter.inference_input_type = tf.int8
+ converter.inference_output_type = tf.int8
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+ elif inference_type == 'int16':
+ converter.inference_input_type = tf.int16
+ converter.inference_output_type = tf.int16
+ supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+ # Int8 post training quantization needs representative dataset.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+ help='If quantize is true, the input and output type of the model: fp32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+ Calculated mfcc features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+ raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+ raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+ None if '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+ # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py
new file mode 100644
index 0000000..1bec940
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from cnn_l_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f' (N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f' (N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f' (N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f' (N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..d818b93
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CNN_Large - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| CNN_Large \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:11:37.988637: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:12:28.656297: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:12:28.695168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:12:28.695203: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:12:28.715771: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:12:28.715835: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:12:28.718556: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:12:28.718828: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:12:28.719402: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:12:28.720115: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:12:28.720266: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:12:28.720628: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:12:28.720911: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:12:28.721608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:12:28.721996: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:12:28.722060: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:12:29.189512: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:12:29.189552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:12:29.189560: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:12:29.190094: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:12:30.746072: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:12:31.596489: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:12:31.596713: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:12:31.597272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:12:31.597524: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:12:31.597556: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:12:31.597566: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:12:31.597575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:12:31.597851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:12:31.615526: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:12:31.619233: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.019ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.003ms.\n",
+ "\n",
+ "2023-01-31 12:12:31.702242: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:12:31.702286: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:12:31.707954: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:12:31.710595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:12:31.710946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:12:31.710984: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:12:31.710993: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:12:31.711005: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:12:31.711361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 12:12:31.770147: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 301 3 6 3 16 7 5 10 3 4 13]\n",
+ " [ 0 1 383 1 1 1 5 2 0 0 0 3]\n",
+ " [ 0 8 3 362 1 13 3 0 1 1 2 12]\n",
+ " [ 0 2 1 0 322 0 2 0 5 10 5 3]\n",
+ " [ 0 2 0 8 0 360 0 0 0 1 1 5]\n",
+ " [ 0 1 8 4 0 1 336 1 1 0 0 0]\n",
+ " [ 0 6 0 0 1 0 1 353 0 1 1 0]\n",
+ " [ 1 3 0 1 4 1 0 0 342 7 1 3]\n",
+ " [ 0 3 0 1 19 1 2 0 4 338 4 1]\n",
+ " [ 1 1 2 0 7 1 1 0 2 1 334 0]\n",
+ " [ 0 5 0 9 1 7 0 1 1 3 1 344]]\n",
+ "Validation accuracy = 93.27%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 350 1 5 5 8 9 5 8 0 6 11]\n",
+ " [ 0 9 401 0 0 1 3 0 0 1 0 4]\n",
+ " [ 0 2 1 375 0 8 5 0 0 0 0 14]\n",
+ " [ 0 8 0 2 388 2 0 0 5 13 4 3]\n",
+ " [ 0 4 1 8 1 378 1 0 2 0 1 10]\n",
+ " [ 0 5 7 1 2 0 396 0 0 0 1 0]\n",
+ " [ 0 11 0 0 0 1 5 377 0 0 1 1]\n",
+ " [ 0 5 0 0 4 4 0 0 363 14 2 4]\n",
+ " [ 0 4 0 2 12 0 1 0 6 374 1 2]\n",
+ " [ 0 0 0 0 5 5 0 0 0 1 400 0]\n",
+ " [ 0 4 2 13 3 13 3 1 0 3 1 359]]\n",
+ "Test accuracy = 93.44%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:13:11.688023: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:14:02.193138: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:14:02.228847: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:14:02.228887: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:14:02.249127: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:14:02.249193: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:14:02.251962: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:14:02.252223: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:14:02.252782: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:14:02.253506: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:14:02.253657: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:14:02.254137: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:14:02.254437: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:14:02.255267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:14:02.255838: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:14:02.255907: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:14:02.712898: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:14:02.712937: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:14:02.712946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:14:02.713547: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:14:04.312064: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:14:05.110529: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:14:05.110622: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:14:05.111243: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:14:05.111519: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:14:05.111551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:14:05.111562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:14:05.111570: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:14:05.111865: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:14:05.131485: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:14:05.133498: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.009ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 12:14:05.210179: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:14:05.210218: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:14:05.215177: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:14:05.217453: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:14:05.217717: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:14:05.217748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:14:05.217758: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:14:05.217766: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:14:05.218054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:14:05.257830: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 304 3 4 3 14 7 6 9 4 4 13]\n",
+ " [ 0 2 382 2 1 0 4 2 0 1 0 3]\n",
+ " [ 0 7 3 356 5 11 3 0 1 1 3 16]\n",
+ " [ 0 2 1 0 318 1 2 0 5 10 8 3]\n",
+ " [ 0 2 0 8 1 354 1 0 0 0 4 7]\n",
+ " [ 0 2 6 3 3 1 333 2 1 0 0 1]\n",
+ " [ 0 7 0 0 1 0 3 349 0 2 1 0]\n",
+ " [ 1 4 0 2 4 1 0 0 341 6 1 3]\n",
+ " [ 0 3 1 1 24 1 4 0 6 328 3 2]\n",
+ " [ 1 3 2 0 10 3 0 0 0 1 330 0]\n",
+ " [ 0 5 0 8 2 8 0 1 1 3 2 342]]\n",
+ "Validation accuracy = 92.42%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 351 2 4 4 7 9 6 9 0 6 10]\n",
+ " [ 0 12 392 0 1 1 9 0 0 0 2 2]\n",
+ " [ 0 5 1 366 2 8 6 1 0 0 1 15]\n",
+ " [ 0 8 1 2 379 3 2 2 7 10 9 2]\n",
+ " [ 0 7 1 10 1 370 1 1 1 0 4 10]\n",
+ " [ 0 8 7 2 4 0 387 2 0 0 2 0]\n",
+ " [ 0 10 0 0 1 0 8 372 0 1 2 2]\n",
+ " [ 1 12 0 0 6 4 0 1 356 11 0 5]\n",
+ " [ 0 5 0 2 15 0 0 1 6 368 2 3]\n",
+ " [ 0 0 0 2 4 4 0 0 0 0 399 2]\n",
+ " [ 0 5 0 12 4 15 4 1 1 1 4 355]]\n",
+ "Test accuracy = 92.09%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
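+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a rough sketch of how the ```cnn``` architecture consumes *model_size_info* (see `create_cnn_model` in [models.py](model_core_utils/models.py)), the snippet below builds a small CNN from a 12-element list. The values are purely illustrative and are **not** the CNN Large hyperparameters; the exact commands and hyperparameters used for this package are in [recreate_model.sh](recreate_model.sh).\n",
+    "\n",
+    "```python\n",
+    "from model_core_utils import models\n",
+    "\n",
+    "# Illustrative settings only (matching the script defaults), not the CNN Large configuration.\n",
+    "model_settings = models.prepare_model_settings(\n",
+    "    label_count=12, sample_rate=16000, clip_duration_ms=1000,\n",
+    "    window_size_ms=30.0, window_stride_ms=10.0, dct_coefficient_count=40)\n",
+    "\n",
+    "model_size_info = [\n",
+    "    16, 10, 4, 1, 1,   # conv 1: filters, kernel height (time), kernel width (frequency), stride y, stride x\n",
+    "    32, 10, 4, 2, 1,   # conv 2: filters, kernel height, kernel width, stride y, stride x\n",
+    "    32,                # linear layer units\n",
+    "    64,                # final fully-connected layer units\n",
+    "]\n",
+    "\n",
+    "model = models.create_model(model_settings, 'cnn', model_size_info, is_training=False)\n",
+    "model.summary()\n",
+    "```"
+   ]
+  },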
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
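+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the core of the clustering step in [optimisations.py](optimisations.py) looks roughly like the sketch below: the trained Keras model is wrapped with the clustering API, fine-tuned briefly, and then stripped of the clustering wrappers before the checkpoint is saved. The `model`, `train_data` and `val_data` arguments are assumed to come from the training step.\n",
+    "\n",
+    "```python\n",
+    "import tensorflow as tf\n",
+    "import tensorflow_model_optimization as tfmot\n",
+    "\n",
+    "def cluster_and_finetune(model, train_data, val_data, epochs=3):\n",
+    "    # Sketch of the optimization step: 32 clusters with k-means++ centroid initialization.\n",
+    "    clustering_params = {\n",
+    "        'number_of_clusters': 32,\n",
+    "        'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,\n",
+    "    }\n",
+    "    clustered = tfmot.clustering.keras.cluster_weights(model, **clustering_params)\n",
+    "    clustered.compile(optimizer=tf.keras.optimizers.Adam(1e-4),\n",
+    "                      loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+    "                      metrics=['accuracy'])\n",
+    "    clustered.fit(train_data, validation_data=val_data, epochs=epochs)  # short fine-tuning only\n",
+    "    # Remove the clustering wrappers so the weights can be saved and quantized as usual.\n",
+    "    return tfmot.clustering.keras.strip_clustering(clustered)\n",
+    "```"
+   ]
+  },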
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
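+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Under the hood, `convert_to_tflite.py` relies on TensorFlow's post-training quantization. A minimal sketch of a fully int8 conversion is shown below; it assumes `model` is the trained Keras model and `calibration_data` is a `tf.data` dataset of (features, label) pairs from the training set. The script in this package is the reference implementation and also covers the int16 case.\n",
+    "\n",
+    "```python\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "def convert_to_int8(model, calibration_data, output_path='dnn_quantized.tflite'):\n",
+    "    # Sketch only: convert_to_tflite.py in this package is the reference implementation.\n",
+    "    converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "    converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "\n",
+    "    def representative_dataset():\n",
+    "        # A few hundred feature vectors let the converter calibrate activation ranges.\n",
+    "        for features, _ in calibration_data.take(300):\n",
+    "            yield [tf.reshape(tf.cast(features, tf.float32), [1, -1])]\n",
+    "\n",
+    "    converter.representative_dataset = representative_dataset\n",
+    "    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+    "    converter.inference_input_type = tf.int8\n",
+    "    converter.inference_output_type = tf.int8\n",
+    "\n",
+    "    with open(output_path, 'wb') as f:\n",
+    "        f.write(converter.convert())\n",
+    "```"
+   ]
+  },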
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+ "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n",
+ "\n",
+ "```python cnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
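+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you want to drive a TFLite file directly rather than through `cnn_l_inference_tflite.py`, a minimal sketch using the TFLite interpreter is shown below. It assumes `features` is an already-extracted MFCC fingerprint of shape (1, 490) as a float32 NumPy array for the fp32 model; for the int8 model the input must additionally be quantized using the scale and zero point reported by `get_input_details()`.\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "def run_single_inference(tflite_path, features):\n",
+    "    # Sketch only: see cnn_l_inference_tflite.py for the full flow starting from a wav file.\n",
+    "    interpreter = tf.lite.Interpreter(model_path=tflite_path)\n",
+    "    interpreter.allocate_tensors()\n",
+    "\n",
+    "    input_details = interpreter.get_input_details()[0]\n",
+    "    output_details = interpreter.get_output_details()[0]\n",
+    "\n",
+    "    interpreter.set_tensor(input_details['index'], features.astype(np.float32))\n",
+    "    interpreter.invoke()\n",
+    "\n",
+    "    probabilities = interpreter.get_tensor(output_details['index'])[0]  # shape (12,)\n",
+    "    return int(np.argmax(probabilities))\n",
+    "```"
+   ]
+  },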
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..fdb2fcc
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | e77e0f185dd6b7b9adcb9d867279a6c0a0ecbf02 |
+| Size (Bytes) | 1908316 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 93.44% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite
new file mode 100644
index 0000000..cab79f2
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a82f9c75ab57bafccbe9a154454d228c9610bd66cb186a69bab4fcc9958558
+size 1908316
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..9404113
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 93.44%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 1908316
+ filename: cnn_l.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: e77e0f185dd6b7b9adcb9d867279a6c0a0ecbf02
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+      use_case: Output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..4b93b40
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0d3177ad9e25a08e300d6dab37303348cc99cda9137a0ed98bfe4ecabb4cbe2
+size 2088
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..cca051a
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84727ee69c9018fcd7295ca5646c29a982b948ce3abd7c4a9c44c7203c699b24
+size 176
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..8befb51
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | a61ab748ae8f52f78ab568342db67a792c6ecf34 |
+| Size (Bytes) | 484600 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 92.27% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/cnn_l_quantized.tflite b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/cnn_l_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/cnn_l_quantized.tflite
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/cnn_l_quantized.tflite
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..32429b1
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 92.27%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 484600
+ filename: cnn_l_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: a61ab748ae8f52f78ab568342db67a792c6ecf34
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+      use_case: Output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb
new file mode 100644
index 0000000..95bf328
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4200839672e3d67af379cc06349ee6af8ab3b53c966562595b31473afc252c6d
+size 28876
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb
new file mode 100644
index 0000000..ff4b1b6
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d0494f8fe5b99a8b92217809d33d287f855e9281465548650037906c57912a2
+size 302218
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..d05f350
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d8519182ae8e5d3dbf4762e2db5c1ac27472e95e9ef4aa0772aec6991020ffd
+size 1917320
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index
new file mode 100644
index 0000000..f6645fe
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:398bc377f651632cfde25ca4c1e372d04fe199868080ec162f482db3a7d8399e
+size 1478
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.data-00000-of-00001
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.index b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.index
similarity index 100%
rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.index
rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.index
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training=False):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+        model_size_info: Array with specific information for the chosen architecture
+            (e.g. convolutional parameters, number of layers).
+        is_training: Whether the model is being built for training (currently only used by the LSTM architecture).
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+        raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+                        f'should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+        model_size_info: Array of model dimensions; the first element sets the number of LSTM units.
+        is_training: Whether the model is being built for training; this controls whether the LSTM is unrolled.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+          each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+    first_filter_stride_x = model_size_info[4] # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+    inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+    inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
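+        # Track the time/frequency dimensions after striding so the final average pool covers the whole feature map.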
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+    # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
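+    # 32 weight clusters with k-means++ centroid initialization, as described in section 4.0 of the how-to guide.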
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so we need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..1ea0506
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/cnn_0.94_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# CNN Large training
+if [ "$train" = true ]
+then
+python train.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_L/retrain_logs --train_dir work/CNN/CNN_L/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/train.py b/models/keyword_spotting/cnn_large/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
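+    """Train a keyword spotting model using the supplied flags, then test and save it."""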
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay the learning rate in a piecewise constant way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so we need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/README.md b/models/keyword_spotting/cnn_large/tflite_int8/README.md
deleted file mode 100644
index 479133f..0000000
--- a/models/keyword_spotting/cnn_large/tflite_int8/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# CNN Large INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the CNN Large model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | a61ab748ae8f52f78ab568342db67a792c6ecf34 |
-| Size (Bytes) | 484600 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.931 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml
deleted file mode 100644
index 63dcf0d..0000000
--- a/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 93.09%
-description: 'This is a fully quantized version (asymmetrical int8) of the CNN Large
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 484600
- filename: cnn_l_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: a61ab748ae8f52f78ab568342db67a792c6ecf34
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/cnn_large/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/README.md
new file mode 100644
index 0000000..bb7380f
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# CNN Medium model package
+
+This folder contains code that will allow you to recreate the CNN Medium keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | CNN_Medium |
+|:---------------: |:--------------------------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+To recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced;
+the quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model with the accuracy stated in this repository.
+Note that exporting to TFLite will still happen with the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path/to/checkpoint>
+```
+
+
+## Training
+
+To train, for example, a DNN with 3 fully-connected layers of 128 neurons each, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions (such as the number of layers and the convolution filter sizes/strides) as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](models.py).
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters (see the sketch at the end of this section).
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
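+
+For reference, the core of what `optimisations.py` does with the clustering API is sketched below. This is only an illustrative outline: `trained_model` is a placeholder for the Keras model restored from your checkpoint, and the compile/fine-tune loop driven by the command-line flags is omitted.
+
+```python
+import tensorflow_model_optimization as tfmot
+
+# Clustering hyperparameters used in this package: 32 clusters, kmeans++ centroid initialization.
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}
+
+# Wrap the trained model with clustering, fine-tune it, then strip the wrappers before export.
+clustered_model = tfmot.clustering.keras.cluster_weights(trained_model, **clustering_params)
+# ... compile and fit clustered_model for a small number of steps ...
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```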
+
+## Quantization and TFLite Conversion
+
+We now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The `inference_type` parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. A sketch of the corresponding converter settings is shown below.
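+
+For illustration, when `--inference_type int8` is passed, `convert_to_tflite.py` configures the TFLite converter roughly as follows. `trained_model` and `rep_dataset` are placeholders for the Keras model restored from the checkpoint and a generator that yields representative MFCC inputs from the validation set:
+
+```python
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_keras_model(trained_model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = rep_dataset  # yields lists of sample MFCC inputs
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8
+converter.inference_output_type = tf.int8
+tflite_model = converter.convert()
+```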
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
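+
+If you want to double-check a generated TFLite file, one quick (optional) way is to inspect its input and output details with the TFLite interpreter. The file name below assumes the default `dnn` architecture flag was used:
+
+```python
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path='dnn_quantized.tflite')
+interpreter.allocate_tensors()
+# The I/O dtypes are int8/int16 only when --inference_type was set accordingly;
+# otherwise the interface stays float32 even for the quantized model.
+print(interpreter.get_input_details()[0]['dtype'], interpreter.get_input_details()[0]['shape'])
+print(interpreter.get_output_details()[0]['dtype'], interpreter.get_output_details()[0]['shape'])
+```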
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+ if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    # Default to int8 weights and activations with float32 inputs/outputs unless a
+    # quantized interface type is explicitly requested.
+    supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    if inference_type == 'int8':
+        converter.inference_input_type = tf.int8
+        converter.inference_output_type = tf.int8
+    elif inference_type == 'int16':
+        converter.inference_input_type = tf.int16
+        converter.inference_output_type = tf.int16
+        supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+    # Int8 post training quantization needs a representative dataset.
+    converter.representative_dataset = rep_dataset
+    converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+ help='If quantize is true, whether the model input and output are float32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+ Calculated mffc features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+ raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+ raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+ None if the '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+ # Ragged tensor as we can't use lists in tf.data Dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py
new file mode 100644
index 0000000..e5dcf30
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from cnn_m_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..34a8579
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CNN_Medium - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| CNN_Medium \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:28:00.950084: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:28:52.604010: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:28:52.642244: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:28:52.642282: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:28:52.661881: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:28:52.661959: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:28:52.664744: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:28:52.665058: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:28:52.665625: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:28:52.666342: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:28:52.666491: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:28:52.666964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:28:52.667239: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:28:52.668032: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:28:52.668409: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:28:52.668474: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:28:53.120304: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:28:53.120344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:28:53.120355: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:28:53.120872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:28:54.678368: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:28:55.540021: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:28:55.540187: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:28:55.540624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:28:55.540870: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:28:55.540900: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:28:55.540909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:28:55.540916: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:28:55.541191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:28:55.559442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:28:55.561433: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 12:28:55.642998: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:28:55.643041: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:28:55.647105: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:28:55.649478: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:28:55.649793: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:28:55.649827: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:28:55.649839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:28:55.649846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:28:55.650184: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 12:28:55.708536: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 307 3 8 0 15 9 4 10 5 1 9]\n",
+ " [ 0 2 384 0 0 2 6 1 0 0 0 2]\n",
+ " [ 1 5 3 368 1 12 3 0 1 0 2 10]\n",
+ " [ 0 1 1 2 324 0 3 0 0 12 6 1]\n",
+ " [ 0 3 0 12 0 352 2 1 0 1 1 5]\n",
+ " [ 0 5 8 1 1 0 334 2 0 1 0 0]\n",
+ " [ 0 3 0 1 1 1 1 352 1 2 0 1]\n",
+ " [ 1 7 0 0 5 0 0 0 337 9 1 3]\n",
+ " [ 0 7 1 0 16 0 1 0 2 342 3 1]\n",
+ " [ 1 2 1 0 9 2 1 0 1 2 330 1]\n",
+ " [ 0 5 0 11 1 6 1 0 2 3 3 340]]\n",
+ "Validation accuracy = 93.16%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 341 3 6 5 6 8 6 12 3 3 15]\n",
+ " [ 0 6 395 2 0 2 12 0 0 0 0 2]\n",
+ " [ 0 8 2 363 0 11 9 0 0 0 0 12]\n",
+ " [ 0 8 0 1 386 1 1 0 3 15 8 2]\n",
+ " [ 0 6 3 12 1 371 3 0 3 0 1 6]\n",
+ " [ 0 4 6 1 1 1 394 3 0 0 2 0]\n",
+ " [ 0 13 0 0 1 0 6 372 0 1 1 2]\n",
+ " [ 1 9 0 0 4 7 1 0 356 17 1 0]\n",
+ " [ 0 5 0 1 14 0 3 1 5 364 1 8]\n",
+ " [ 0 0 0 0 9 3 1 0 0 1 392 5]\n",
+ " [ 0 8 1 24 3 6 2 0 0 4 5 349]]\n",
+ "Test accuracy = 91.84%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:29:24.873900: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:30:17.291981: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:30:17.332661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:30:17.332698: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:30:17.352880: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:30:17.352950: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:30:17.355747: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:30:17.356015: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:30:17.356577: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:30:17.357311: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:30:17.357465: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:30:17.357965: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:30:17.358267: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:30:17.358989: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:30:17.359555: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:30:17.359642: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:30:17.803416: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:30:17.803457: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:30:17.803465: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:30:17.803976: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:30:19.386735: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:30:20.196203: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:30:20.196287: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:30:20.196874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:30:20.197122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:30:20.197152: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:30:20.197161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:30:20.197168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:30:20.197458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:30:20.215456: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:30:20.218487: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.015ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.003ms.\n",
+ "\n",
+ "2023-01-31 12:30:20.293490: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:30:20.293531: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:30:20.297417: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:30:20.299779: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:30:20.300054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:30:20.300091: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:30:20.300104: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:30:20.300114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:30:20.300414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:30:20.327055: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 305 3 9 0 14 12 4 7 5 1 11]\n",
+ " [ 1 3 380 0 0 1 7 3 0 0 1 1]\n",
+ " [ 1 14 3 349 1 10 7 0 1 0 4 16]\n",
+ " [ 0 4 1 1 310 2 3 0 0 15 13 1]\n",
+ " [ 0 6 0 12 0 341 2 1 1 1 8 5]\n",
+ " [ 0 5 9 1 3 0 327 3 0 0 4 0]\n",
+ " [ 0 7 0 0 3 0 3 346 0 2 0 2]\n",
+ " [ 1 12 0 1 2 0 0 0 333 9 1 4]\n",
+ " [ 0 7 1 0 20 0 2 0 1 331 8 3]\n",
+ " [ 1 2 1 0 11 2 2 0 1 2 326 2]\n",
+ " [ 0 5 0 12 2 7 1 0 4 2 5 334]]\n",
+ "Validation accuracy = 91.18%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 1 335 5 6 6 9 9 5 12 3 2 15]\n",
+ " [ 0 9 390 3 0 3 10 0 0 1 0 3]\n",
+ " [ 0 8 2 358 3 10 8 1 0 0 2 13]\n",
+ " [ 0 10 0 1 380 2 5 1 2 14 9 1]\n",
+ " [ 0 7 3 12 2 361 3 0 3 0 4 11]\n",
+ " [ 0 7 7 1 1 0 391 5 0 0 0 0]\n",
+ " [ 0 14 0 0 2 1 6 367 0 2 1 3]\n",
+ " [ 2 13 0 0 6 7 2 0 349 16 1 0]\n",
+ " [ 0 6 0 1 13 0 5 1 4 360 3 9]\n",
+ " [ 0 1 0 1 8 9 1 0 0 1 382 8]\n",
+ " [ 0 10 0 29 3 8 3 2 0 5 6 336]]\n",
+ "Test accuracy = 90.33%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
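+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch of those helpers (this only constructs the network; `train.py` also handles the data pipeline and the training loop), the example DNN can be built directly from Python. The settings below are simply the script defaults used elsewhere in this package and are illustrative only:\n",
+ "\n",
+ "```python\n",
+ "from model_core_utils import models\n",
+ "\n",
+ "# Feature-extraction settings matching the script defaults (sketch only).\n",
+ "model_settings = models.prepare_model_settings(\n",
+ "    label_count=12, sample_rate=16000, clip_duration_ms=1000,\n",
+ "    window_size_ms=30.0, window_stride_ms=10.0, dct_coefficient_count=40)\n",
+ "\n",
+ "# Build the DNN from the example command: three fully-connected layers of 128 neurons.\n",
+ "model = models.create_model(model_settings, 'dnn', [128, 128, 128], is_training=True)\n",
+ "model.summary()\n",
+ "```"
+ ]
+ },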
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
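+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, a minimal sketch of weight clustering with the TensorFlow Model Optimization toolkit is shown below, using the 32-cluster / k-means++ hyperparameters mentioned above. This is illustrative only, not the exact contents of `optimisations.py`; `model` and `train_data` are assumed to already exist:\n",
+ "\n",
+ "```python\n",
+ "import tensorflow_model_optimization as tfmot\n",
+ "\n",
+ "clustering_params = {\n",
+ "    'number_of_clusters': 32,\n",
+ "    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,\n",
+ "}\n",
+ "\n",
+ "# Wrap the trained Keras model so its weights are clustered, then fine-tune briefly.\n",
+ "clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)\n",
+ "clustered_model.compile(optimizer='adam',\n",
+ "                        loss='sparse_categorical_crossentropy',\n",
+ "                        metrics=['accuracy'])\n",
+ "clustered_model.fit(train_data, epochs=1)\n",
+ "\n",
+ "# Remove the clustering wrappers before export or quantization.\n",
+ "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+ "```"
+ ]
+ },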
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
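+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below is a minimal sketch of full int8 post-training quantization with the standard TFLite converter. It is illustrative only, not the exact contents of `convert_to_tflite.py`; `model` is assumed to be a trained Keras model and `calibration_data` a small set of MFCC fingerprints:\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def representative_dataset():\n",
+ "    # Yield a few hundred real MFCC inputs so the converter can calibrate ranges.\n",
+ "    for mfcc in calibration_data:\n",
+ "        yield [tf.cast(tf.reshape(mfcc, [1, -1]), tf.float32)]\n",
+ "\n",
+ "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+ "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+ "converter.representative_dataset = representative_dataset\n",
+ "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+ "converter.inference_input_type = tf.int8\n",
+ "converter.inference_output_type = tf.int8\n",
+ "\n",
+ "with open('dnn_quantized.tflite', 'wb') as f:\n",
+ "    f.write(converter.convert())\n",
+ "```"
+ ]
+ },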
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+ "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n",
+ "\n",
+ "```python cnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
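+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For orientation, the sketch below shows single-sample inference with the TFLite interpreter directly. Paths are illustrative, and `cnn_m_inference_tflite.py` additionally performs the MFCC feature extraction from a wav file, which is skipped here by loading a pre-computed (1, 490) example input:\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "interpreter = tf.lite.Interpreter(model_path='model_archive/TFLite/tflite_fp32/cnn_m.tflite')\n",
+ "interpreter.allocate_tensors()\n",
+ "input_details = interpreter.get_input_details()[0]\n",
+ "output_details = interpreter.get_output_details()[0]\n",
+ "\n",
+ "mfcc = np.load('model_archive/TFLite/tflite_fp32/testing_input/input/0.npy').astype(np.float32)\n",
+ "# For the int8 file, quantize the input first using input_details['quantization'].\n",
+ "interpreter.set_tensor(input_details['index'], mfcc.reshape(input_details['shape']))\n",
+ "interpreter.invoke()\n",
+ "probabilities = interpreter.get_tensor(output_details['index'])[0]\n",
+ "print(int(np.argmax(probabilities)))  # index into validation_utils/labels.txt\n",
+ "```"
+ ]
+ },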
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..37debc0
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the CNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 0057378e784ccb8fa28abaa972a86988fbecea19 |
+| Size (Bytes) | 717268 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 91.84% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a set of processed MFCCs. |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite
new file mode 100644
index 0000000..f928da7
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d18705eebbb20d0ffa569266c97c839082f9a6cd37115c834661081832edc22c
+size 717268
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..8bea635
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 91.84%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the CNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 717268
+ filename: cnn_m.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 0057378e784ccb8fa28abaa972a86988fbecea19
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..1752993
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2a935408c16cb85e8d23f9d604ea41231df1f8005c067e0a692146e7b881481
+size 2088
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..c590a95
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62786f0bb0878883ab48d4a76086aff8cea161ac537ea41615901378926052a8
+size 176
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..6318de4
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the CNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 6bc68074d960bbb0c695e19fd96fd7903131ef60 |
+| Size (Bytes) | 186064 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 90.47% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | fp32 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/cnn_m_quantized.tflite b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/cnn_m_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/cnn_m_quantized.tflite
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/cnn_m_quantized.tflite
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..10f79a7
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 90.47%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the CNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 186064
+ filename: cnn_m_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 6bc68074d960bbb0c695e19fd96fd7903131ef60
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb
new file mode 100644
index 0000000..30ebf5e
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae36d2d043a0d2b71e7f5fd8eef87f627324344451706fbfa6dcdcd9fd95bd6f
+size 28876
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb
new file mode 100644
index 0000000..5d6fdbc
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0228b4fa8fed68d9bfbaa60e6f7157f91c6b4e142d0278b4141006749fc1ccd8
+size 302218
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..6a79c8b
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d54b7d5df343e2d5285d1d64a9bfb743ace65a402e87e9d963e69b0417a59e5d
+size 725888
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index
new file mode 100644
index 0000000..99cba5f
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d12a6c029bb2ff6a692e3376a01e160f78461add8d82d1d6c53e7e65c0d5f278
+size 1476
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.data-00000-of-00001
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.index b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.index
similarity index 100%
rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.index
rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.index
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
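+  # Worked example (illustrative): with the CNN settings used in this package's recreate_model.sh
+  # (16 kHz audio, 1000 ms clips, 40 ms windows, 20 ms stride, 10 DCT coefficients) this gives
+  # 1 + (16000 - 640) // 320 = 49 frames and fingerprint_size = 49 * 10 = 490.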
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+    model_size_info: Array with specific information for the chosen architecture
+      (e.g. convolutional parameters, number of layers).
+    is_training: Whether the model is being built for training or inference (only
+      used by the 'basic_lstm' architecture to control unrolling).
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+    raise Exception(f'model_architecture argument {model_architecture} not recognized'
+                    f', should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+    model_size_info: Array whose first element gives the number of units in the LSTM layer.
+    is_training: Whether the model is being built for training; when False the LSTM is
+      unrolled for inference.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+    model_size_info: Length of the array defines the number of hidden layers and
+      each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+  first_filter_stride_x = model_size_info[4]  # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+  # Fully connected layer followed by batch norm and ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+  # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
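+    # cluster_weights wraps each supported layer so that, during fine-tuning, its kernel values
+    # are constrained to the 32 learned centroids configured above.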
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
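+    # For example, with the default '3750,750' schedule and eval_step_interval=400 this is
+    # ceil(4500 / 400) = 12 fine-tuning 'epochs' of 400 steps each.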
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
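+    # strip_clustering removes the clustering wrappers, leaving ordinary layers whose kernels
+    # contain only the learned centroid values; this is the model whose weights are saved below.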
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..a295f58
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/cnn_0.93_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# CNN Medium training
+if [ "$train" = true ]
+then
+python train.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_M/retrain_logs --train_dir work/CNN/CNN_M/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/train.py b/models/keyword_spotting/cnn_medium/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
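+    # For example, with the default '15000,3000' schedule and eval_step_interval=400 this is
+    # ceil(18000 / 400) = 45 training 'epochs' of 400 steps each.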
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/README.md b/models/keyword_spotting/cnn_medium/tflite_int8/README.md
deleted file mode 100644
index 5576d61..0000000
--- a/models/keyword_spotting/cnn_medium/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# CNN Medium INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the CNN Medium model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 6bc68074d960bbb0c695e19fd96fd7903131ef60 |
-| Size (Bytes) | 186064 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.911 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml
deleted file mode 100644
index a7851bb..0000000
--- a/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 91.08%
-description: 'This is a fully quantized version (asymmetrical int8) of the CNN Medium
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 186064
- filename: cnn_m_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 6bc68074d960bbb0c695e19fd96fd7903131ef60
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/README.md b/models/keyword_spotting/cnn_small/model_package_tf/README.md
new file mode 100644
index 0000000..b74f3ba
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# CNN Small model package
+
+This folder contains code that will allow you to recreate the CNN Small keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | CNN_Small |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+In order to recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced.
+The quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model with the accuracy stated in this repository.
+Note that the export to TFLite will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path to checkpoint files>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions such as number of layers, convolution filter size/stride as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](models.py).
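+
+As an illustration, for the `cnn` architecture used by this package the list is read as two convolution specifications followed by the linear and final fully-connected layer sizes (a sketch based on `create_cnn_model` in the model source; the braced names are placeholders, not tuned values):
+
+```
+--model_size_info {conv1 filters} {conv1 kernel height} {conv1 kernel width} {conv1 stride y} {conv1 stride x}
+                  {conv2 filters} {conv2 kernel height} {conv2 kernel width} {conv2 stride y} {conv2 stride x}
+                  {linear layer size} {fully-connected layer size}
+```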
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.
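+
+These hyperparameters correspond to the following use of the clustering API (a simplified sketch mirroring `optimisations.py`; `model` here stands for your trained Keras model):
+
+```
+import tensorflow_model_optimization as tfmot
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+```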
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models straightforward.
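+
+Under the hood this is the standard TFLite converter flow, roughly as sketched below (a simplified view of what `convert_to_tflite.py` does; `model` is the restored Keras model and `rep_dataset` a generator yielding representative MFCC inputs):
+
+```
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = rep_dataset
+tflite_model = converter.convert()
+```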
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The *optional* inference_type parameter sets the input and output type of the fully quantized model to int8 or int16. It defaults to fp32, in which case the inputs and outputs stay in floating point.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
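+    # Quantize the fp32 features into the model's input type: q = round(x / scale + zero_point).
+    # For an fp32 model scale=1 and zero_point=0, so the data passes through unchanged.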
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
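+    # Dequantize the output back to fp32: x = scale * (q - zero_point); identity for fp32 models.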
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+            if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    # Default op set so supported_ops is defined even when inference_type stays fp32.
+    supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS
+    if inference_type == 'int8':
+ converter.inference_input_type = tf.int8
+ converter.inference_output_type = tf.int8
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+ if inference_type == 'int16':
+ converter.inference_input_type = tf.int16
+ converter.inference_output_type = tf.int16
+ supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+ # Int8 post training quantization needs representative dataset.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+        help='If quantize is true, whether the model input and output type is fp32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+      Calculated MFCC features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
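+  # For example, with validation_percentage=10 and testing_percentage=10, a hash
+  # landing at 7% goes to 'validation', 15% to 'testing' and 20% or above to 'training'.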
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+            raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+            raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+            silence_percentage: Percent of words that should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
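+        # Label indices 0 and 1 are reserved for the silence and unknown classes,
+        # so wanted words start at index 2.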
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+          None if the '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
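+        # Load each background wav in full (desired_samples=-1 keeps the whole clip).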
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+        # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py
new file mode 100644
index 0000000..026e8f8
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from cnn_s_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
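+    # Run TFLite inference one clip at a time and keep the argmax class prediction.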
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
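+    # Predict over the whole set in batches, then take the argmax class per clip.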
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
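+    # Evaluate the TFLite file and/or the Keras checkpoint, depending on the arguments supplied.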
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..8b19ae4
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CNN_Small - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| CNN_Small \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:13:21.365383: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:14:12.415896: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:14:12.453662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:14:12.453701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:14:12.477025: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:14:12.477130: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:14:12.480970: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:14:12.481614: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:14:12.482232: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:14:12.483034: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:14:12.483190: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:14:12.483677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:14:12.483964: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:14:12.484760: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:14:12.485262: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:14:12.485316: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:14:12.916344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:14:12.916381: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:14:12.916389: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:14:12.916905: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:14:14.471348: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:14:15.329325: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:14:15.329556: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:14:15.329983: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:14:15.330272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:14:15.330306: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:14:15.330322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:14:15.330334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:14:15.330642: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:14:15.347491: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:14:15.352470: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.021ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 13:14:15.425956: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:14:15.425996: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:14:15.429502: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:14:15.431843: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:14:15.432118: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:14:15.432154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:14:15.432167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:14:15.432178: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:14:15.432489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 13:14:15.484981: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 300 5 7 2 13 9 4 11 4 2 14]\n",
+ " [ 0 1 381 4 0 2 8 0 0 0 0 1]\n",
+ " [ 1 13 1 363 0 8 3 1 0 1 4 11]\n",
+ " [ 0 3 1 1 328 0 1 0 5 8 3 0]\n",
+ " [ 0 9 0 12 1 340 5 0 0 0 4 6]\n",
+ " [ 1 3 9 2 2 0 332 2 0 0 0 1]\n",
+ " [ 0 11 0 0 1 2 6 341 0 1 0 1]\n",
+ " [ 1 9 0 0 4 1 0 0 339 8 1 0]\n",
+ " [ 0 3 2 0 20 0 4 0 4 334 3 3]\n",
+ " [ 1 5 1 0 9 1 2 0 0 2 329 0]\n",
+ " [ 0 9 0 28 1 8 1 0 0 5 6 314]]\n",
+ "Validation accuracy = 91.61%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 338 5 9 3 1 6 10 17 4 6 9]\n",
+ " [ 0 10 395 2 0 2 9 0 0 0 0 1]\n",
+ " [ 0 7 3 374 0 10 5 0 0 0 0 6]\n",
+ " [ 0 8 0 0 395 2 0 0 5 7 6 2]\n",
+ " [ 0 9 2 14 1 369 0 1 3 0 2 5]\n",
+ " [ 0 6 7 0 1 0 394 2 0 1 1 0]\n",
+ " [ 0 4 0 0 0 2 8 378 1 1 0 2]\n",
+ " [ 1 13 0 0 5 3 1 0 356 14 1 2]\n",
+ " [ 0 2 0 1 11 0 1 0 7 372 0 8]\n",
+ " [ 0 1 0 0 5 4 2 0 0 0 394 5]\n",
+ " [ 0 15 0 28 4 10 2 2 1 2 2 336]]\n",
+ "Test accuracy = 92.21%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:14:39.184982: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:15:30.798819: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:15:30.834958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:15:30.834997: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:15:30.856434: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:15:30.856508: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:15:30.860012: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:15:30.860406: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:15:30.861063: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:15:30.861848: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:15:30.862001: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:15:30.862359: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:15:30.862643: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:15:30.863248: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:15:30.863639: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:15:30.863701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:15:31.316265: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:15:31.316302: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:15:31.316312: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:15:31.316827: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:15:32.911559: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:15:33.701396: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:15:33.701483: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:15:33.702020: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:15:33.702305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:15:33.702342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:15:33.702357: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:15:33.702364: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:15:33.702677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:15:33.719401: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:15:33.721665: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 13:15:33.790485: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:15:33.790521: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:15:33.793705: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:15:33.795921: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:15:33.796178: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:15:33.796208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:15:33.796218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:15:33.796225: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:15:33.796508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:15:33.820120: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 301 5 8 2 12 9 4 11 3 3 13]\n",
+ " [ 0 2 376 3 0 2 11 2 0 0 0 1]\n",
+ " [ 1 13 1 350 4 8 5 1 1 0 4 18]\n",
+ " [ 0 4 1 1 321 0 2 0 5 7 9 0]\n",
+ " [ 0 10 0 9 3 337 4 1 0 0 7 6]\n",
+ " [ 1 6 9 1 4 1 327 2 0 0 0 1]\n",
+ " [ 0 14 0 0 2 2 6 337 1 1 0 0]\n",
+ " [ 1 9 1 0 4 2 1 0 339 5 1 0]\n",
+ " [ 0 4 1 0 25 0 5 0 6 322 5 5]\n",
+ " [ 1 6 1 0 13 1 1 0 1 3 323 0]\n",
+ " [ 0 11 1 26 3 7 1 1 1 3 4 314]]\n",
+ "Validation accuracy = 90.39%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 332 3 8 7 3 11 6 15 2 10 11]\n",
+ " [ 0 9 390 1 3 2 14 0 0 0 0 0]\n",
+ " [ 0 8 2 355 5 12 8 0 0 0 1 14]\n",
+ " [ 0 12 0 0 386 2 1 0 5 7 11 1]\n",
+ " [ 0 12 2 11 2 363 0 1 4 1 6 4]\n",
+ " [ 0 5 7 0 8 0 388 3 0 1 0 0]\n",
+ " [ 0 5 0 0 4 0 15 369 0 1 0 2]\n",
+ " [ 1 14 0 0 6 3 1 1 352 14 2 2]\n",
+ " [ 0 4 0 1 16 0 4 0 16 352 2 7]\n",
+ " [ 0 1 0 0 10 3 2 1 1 0 388 5]\n",
+ " [ 0 14 1 28 10 14 3 4 0 1 2 325]]\n",
+ "Test accuracy = 90.14%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+ "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n",
+ "\n",
+ "```python cnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..c964371
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the CNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | e9471348e6fb25191092236dac6af7c1fc84116b |
+| Size (Bytes) | 280444 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 92.21% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a set of processed MFCC features |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
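+
+## Example Usage
+A minimal sketch of running this fp32 TFLite file on the bundled example input is shown below. It assumes the TensorFlow Python package is installed and that paths are relative to this folder; the variable names are illustrative only.
+
+```python
+# Minimal sketch: run cnn_s.tflite on the bundled example MFCC features.
+import numpy as np
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path="cnn_s.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Example input of shape [1, 490] stored alongside this README.
+mfcc = np.load("testing_input/input/0.npy").astype(np.float32)
+
+interpreter.set_tensor(input_details["index"], mfcc)
+interpreter.invoke()
+probabilities = interpreter.get_tensor(output_details["index"])  # shape [1, 12]
+print(probabilities.argmax(axis=-1))
+```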
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite
new file mode 100644
index 0000000..11ed7c3
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39d968b59dec6a543fba800718fd72c9009644b39bcfd1e08226e18b40b6d9b5
+size 280444
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..18e9f60
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 92.21%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the CNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 280444
+ filename: cnn_s.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: e9471348e6fb25191092236dac6af7c1fc84116b
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probability of each of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..2759db6
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4e38dbf192916f7af5440e17d27eaf1a19e13054977fed1ec5e85322e3da897
+size 2088
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..b651412
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ad3e4972e18774433a093b7228742fe66dceece314ea2de02bc0ac29a632cf8
+size 176
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..30ae15d
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the CNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f |
+| Size (Bytes) | 75400 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 90.18% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability of each of the 12 keywords |
\ No newline at end of file
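+
+## Example Usage
+For reference, a minimal sketch of running this fully quantized int8 model is shown below. Inputs must be quantized with the input tensor's scale and zero-point; the fp32 MFCC array here is a stand-in for real features produced by the preprocessing pipeline, and all names are illustrative only.
+
+```python
+# Minimal sketch: quantize fp32 MFCC features and run the int8 model.
+import numpy as np
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path="cnn_s_quantized.tflite")
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Stand-in fp32 MFCC features of shape [1, 490]; replace with real features.
+mfcc_fp32 = np.zeros((1, 490), dtype=np.float32)
+
+scale, zero_point = inp["quantization"]
+mfcc_int8 = np.clip(np.round(mfcc_fp32 / scale + zero_point), -128, 127).astype(np.int8)
+
+interpreter.set_tensor(inp["index"], mfcc_int8)
+interpreter.invoke()
+scores_int8 = interpreter.get_tensor(out["index"])  # shape [1, 12], int8
+print(scores_int8.argmax(axis=-1))
+```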
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/cnn_s_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/cnn_s_quantized.tflite
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..c836274
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,64 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 90.18%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the CNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 75400
+ filename: cnn_s_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb
new file mode 100644
index 0000000..f463c39
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97d0b45b0027a13e5c2d0a0049775bfa1ac4661ee6e1e9c20690137ba0b91539
+size 28876
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb
new file mode 100644
index 0000000..1904687
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d3d2c96b473b7cd1b9ca9cd60695a3c6e27d6cc57469b79da75e709e869ff6
+size 302218
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..ad5b44d
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da9dce03052ef2895fbd3b41f28aade4d53d3ba38a706ded903c133b4c57a549
+size 288200
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index
new file mode 100644
index 0000000..c4f021a
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3021889ecbad08fd6d5edf947596f2fd9dee8a594a63a1f3d2f4bafee7271cce
+size 1466
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.data-00000-of-00001
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.index b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.index
similarity index 100%
rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.index
rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.index
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
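+    # For example, with the CNN Small settings used in this package (16 kHz audio, 1000 ms clips,
+    # 40 ms windows, 20 ms stride, 10 MFCCs): spectrogram_length = 1 + (16000 - 640) // 320 = 49,
+    # so fingerprint_size = 10 * 49 = 490, matching the model's (1, 490) input.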
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+        model_size_info: Array with specific information for the chosen architecture
+            (e.g convolutional parameters, number of layers).
+        is_training: Whether the model is being built for training; currently this only
+            affects the basic_lstm architecture (LSTM unrolling).
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+        raise Exception(f'model_architecture argument {model_architecture} not recognized, should be '
+                        f'one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+ is_training: Determining whether the use of the model is for training or for something else.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+    first_filter_stride_x = model_size_info[4]  # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+    # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
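+    """Print the number of unique kernel weight values (clusters) for each layer."""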
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, is_training=True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..1f0289a
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/cnn_0.92_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# CNN Small training
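+# model_size_info for the 'cnn' architecture (see model_core_utils/models.py) is two conv layers described
+# as {num filters, kernel height, kernel width, stride y, stride x}, followed by the linear layer size and
+# the fully-connected layer size. Here: conv1 = 28 filters, 10x4 kernel, stride 1x1; conv2 = 30 filters,
+# 10x4 kernel, stride 2x1; linear = 16; fc = 128.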
+if [ "$train" = true ]
+then
+python train.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_S/retrain_logs --train_dir work/CNN/CNN_S/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/train.py b/models/keyword_spotting/cnn_small/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/README.md b/models/keyword_spotting/cnn_small/tflite_int8/README.md
deleted file mode 100644
index 54e42bd..0000000
--- a/models/keyword_spotting/cnn_small/tflite_int8/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# CNN Small INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the CNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f |
-| Size (Bytes) | 75400 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.912 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml
deleted file mode 100644
index e5cd3c4..0000000
--- a/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 91.23%
-description: 'This is a fully quantized version (asymmetrical int8) of the CNN Small
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 75400
- filename: cnn_s_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/cnn_small/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/README.md b/models/keyword_spotting/dnn_large/model_package_tf/README.md
new file mode 100644
index 0000000..75d5348
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DNN Large model package
+
+This folder contains code that will allow you to recreate the DNN Large keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DNN_Large |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+To recreate the model you will first need to be using ```Python3.7``` and to have installed the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced.
+The quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model that reaches the accuracy stated in this repository.
+Note that the TFLite export will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* passes the neural network layer dimensions, such as the
+number of layers and the convolution filter sizes/strides, as a list to models.py, which builds the
+TensorFlow graph based on the provided model architecture and layer dimensions. For more information on
+*model_size_info* for each network architecture see [models.py](models.py).
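+
+For the 'dnn' architecture, each entry of *--model_size_info* simply becomes one fully-connected hidden layer. A minimal sketch of how this maps to a Keras model (mirroring `create_dnn_model` in models.py; the helper name and shapes below are illustrative only) is:
+
+```python
+# Sketch: each model_size_info entry becomes one ReLU fully-connected layer.
+import tensorflow as tf
+
+def build_dnn(fingerprint_size, label_count, model_size_info=(128, 128, 128)):
+    inputs = tf.keras.Input(shape=(fingerprint_size,), name='input')
+    x = inputs
+    for units in model_size_info:
+        x = tf.keras.layers.Dense(units=units, activation='relu')(x)
+    outputs = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+    return tf.keras.Model(inputs, outputs)
+```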
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. The clustering hyperparameters are 32 weight clusters and the k-means++ centroid initialization method.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
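+
+For reference, a minimal sketch of the clustering configuration set up by `optimisations.py` (using the TensorFlow Model Optimization toolkit, with a stand-in model for illustration only) is:
+
+```python
+# Sketch of the weight-clustering setup: 32 clusters, k-means++ centroid initialization.
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+# Stand-in for the trained model restored from a checkpoint.
+model = tf.keras.Sequential([tf.keras.layers.Dense(12, activation='softmax', input_shape=(490,))])
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+}
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+
+# After fine-tuning, strip the clustering wrappers before saving or converting.
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```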
+
+## Quantization and TFLite Conversion
+
+As part of this update we now use TensorFlow's
+[post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The *inference_type* parameter is optional and should be used when a fully quantized model with int8 or int16 inputs and outputs is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
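+
+Under the hood, both paths use the standard TFLite converter flow. A minimal sketch of the full-integer int8 path (with a stand-in model and representative dataset for illustration only) is:
+
+```python
+# Sketch of int8 post-training quantization with a representative dataset.
+import numpy as np
+import tensorflow as tf
+
+# Stand-in trained model; the real flow restores the DNN from its checkpoint.
+model = tf.keras.Sequential([tf.keras.layers.Dense(12, activation='softmax', input_shape=(490,))])
+
+def representative_dataset():
+    for _ in range(100):
+        yield [np.random.rand(1, 490).astype(np.float32)]
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = representative_dataset
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8
+converter.inference_output_type = tf.int8
+
+tflite_model = converter.convert()
+with open('dnn_quantized.tflite', 'wb') as f:
+    f.write(tflite_model)
+```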
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+            if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    # Default (fp32 input/output): int8 weights/activations with float32 interfaces.
+    supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    if inference_type == 'int8':
+        converter.inference_input_type = tf.int8
+        converter.inference_output_type = tf.int8
+        supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    elif inference_type == 'int16':
+        converter.inference_input_type = tf.int16
+        converter.inference_output_type = tf.int16
+        supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+ # Int8 post training quantization needs representative dataset.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+        help='If quantize is true, whether the model input and output are float32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+        Calculated mfcc features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+            raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+            raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+            None if '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+        # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py
new file mode 100644
index 0000000..5e60134
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from dnn_l_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..67b2031
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DNN_Large - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DNN_Large \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:18:57.429502: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:19:44.590405: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:19:44.627169: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:19:44.627205: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:19:44.650614: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:19:44.650690: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:19:44.653550: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:19:44.653884: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:19:44.654515: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:19:44.655280: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:19:44.655466: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:19:44.655866: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:19:44.656166: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:19:44.657031: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:19:44.657463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:19:44.657531: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:19:45.095453: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:19:45.095490: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:19:45.095499: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:19:45.096006: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:19:46.231729: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:19:46.494512: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:19:46.494713: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:19:46.495116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:19:46.495381: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:19:46.495413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:19:46.495422: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:19:46.495429: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:19:46.495705: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:19:46.519581: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:19:46.520288: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.007ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 13:19:46.560745: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:19:46.560780: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:19:46.564917: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:19:46.566851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:19:46.567112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:19:46.567143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:19:46.567154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:19:46.567161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:19:46.567471: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to dnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 13:19:46.612300: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 265 7 9 5 21 10 10 15 6 5 18]\n",
+ " [ 0 6 353 5 1 2 19 6 0 0 0 5]\n",
+ " [ 0 11 7 340 5 14 3 1 3 1 5 16]\n",
+ " [ 0 4 0 3 296 1 8 1 5 20 8 4]\n",
+ " [ 0 3 0 17 0 334 0 1 6 1 1 14]\n",
+ " [ 0 5 23 1 3 1 307 8 0 2 1 1]\n",
+ " [ 0 10 1 2 2 2 3 339 1 2 0 1]\n",
+ " [ 1 9 1 2 7 7 1 0 323 9 0 3]\n",
+ " [ 0 3 0 1 28 2 3 1 9 323 3 0]\n",
+ " [ 1 4 0 0 10 2 1 0 4 3 324 1]\n",
+ " [ 0 11 1 34 5 17 1 1 3 3 1 295]]\n",
+ "Validation accuracy = 87.06%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 308 9 12 7 10 9 6 19 4 9 15]\n",
+ " [ 0 9 382 2 0 7 14 2 0 0 1 2]\n",
+ " [ 0 9 3 332 0 25 2 0 0 0 2 32]\n",
+ " [ 0 11 1 2 366 4 3 0 11 9 13 5]\n",
+ " [ 0 13 1 27 2 337 8 1 1 0 1 15]\n",
+ " [ 0 9 13 5 4 2 365 9 1 1 2 1]\n",
+ " [ 0 16 0 1 3 2 5 362 2 4 0 1]\n",
+ " [ 0 9 1 0 2 9 1 0 351 21 1 1]\n",
+ " [ 0 10 0 0 17 1 5 2 11 350 1 5]\n",
+ " [ 0 3 1 4 15 4 0 1 0 2 377 4]\n",
+ " [ 0 12 3 55 6 9 4 2 3 5 4 299]]\n",
+ "Test accuracy = 86.65%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:19:59.827495: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:20:49.624250: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:20:49.663343: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:20:49.663382: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:20:49.683862: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:20:49.683941: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:20:49.686764: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:20:49.687075: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:20:49.687678: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:20:49.688414: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:20:49.688571: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:20:49.688929: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:20:49.689226: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:20:49.689923: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:20:49.690297: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:20:49.690365: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:20:50.138334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:20:50.138374: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:20:50.138386: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:20:50.138892: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:20:51.250414: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:20:51.521477: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:20:51.521575: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:20:51.522122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:20:51.522382: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:20:51.522413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:20:51.522424: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:20:51.522432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:20:51.522720: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:20:51.539458: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:20:51.540454: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.01ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 13:20:51.584213: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:20:51.584254: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:20:51.588197: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:20:51.590131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:20:51.590402: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:20:51.590432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:20:51.590442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:20:51.590450: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:20:51.590759: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:20:51.621299: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to dnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 271 7 7 5 19 9 10 16 6 5 16]\n",
+ " [ 0 8 354 6 2 2 17 4 0 0 1 3]\n",
+ " [ 0 16 7 333 9 14 5 2 2 1 3 14]\n",
+ " [ 0 6 1 3 293 2 8 1 4 19 8 5]\n",
+ " [ 0 9 2 19 9 320 0 1 3 1 3 10]\n",
+ " [ 0 3 29 1 5 1 297 11 0 2 1 2]\n",
+ " [ 0 14 1 4 8 1 4 325 1 2 2 1]\n",
+ " [ 1 10 2 1 10 4 1 1 323 7 0 3]\n",
+ " [ 0 4 0 0 32 2 3 0 6 320 4 2]\n",
+ " [ 1 7 0 1 16 3 0 3 3 2 314 0]\n",
+ " [ 0 11 1 47 9 18 1 1 3 3 1 277]]\n",
+ "Validation accuracy = 85.44%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 319 8 13 7 8 9 5 17 4 7 11]\n",
+ " [ 0 10 379 2 5 7 10 2 0 0 1 3]\n",
+ " [ 0 13 3 332 4 20 2 2 0 2 1 26]\n",
+ " [ 0 15 0 3 363 3 3 3 10 11 10 4]\n",
+ " [ 0 17 6 26 5 327 6 2 1 0 2 14]\n",
+ " [ 0 13 15 3 11 2 352 10 2 3 1 0]\n",
+ " [ 0 18 2 0 7 2 9 350 3 2 1 2]\n",
+ " [ 0 13 1 0 9 7 0 1 342 17 4 2]\n",
+ " [ 0 14 1 0 27 0 6 3 8 334 2 7]\n",
+ " [ 0 5 1 2 23 3 4 1 0 1 366 5]\n",
+ " [ 0 13 2 62 13 13 4 2 1 3 6 283]]\n",
+ "Test accuracy = 84.97%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and evaluate them on the validation and test sets. Both an fp32 version and a quantized version will be produced; the quantized version is fully quantized using post-training quantization.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+    "Training is then performed and should produce a model that reaches the accuracy stated in this repository. Note that exporting to TFLite will still use the baseline pre-trained checkpoint files, so to export your newly trained model you will need to re-run the script and supply the path to the new checkpoint files, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+    "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf), provided the right hyperparameters are used. The training commands with all the hyperparameters needed to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variant of the models from the paper; other variants are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+    "The command line argument *--model_size_info* passes the neural network layer\n",
+    "dimensions (such as the number of layers and the convolution filter sizes/strides) as a list to models.py,\n",
+    "which builds the TensorFlow graph based on the provided model architecture\n",
+    "and layer dimensions. For more information on *model_size_info* for each network architecture, see\n",
+    "[models.py](model_core_utils/models.py). An illustrative sketch of how these arguments map onto the Keras model is shown in the next cell.\n"
+ ]
+ },
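+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustrative sketch (assuming the ```model_core_utils``` package from this model package is importable), the DNN Large configuration used in [recreate_model.sh](recreate_model.sh) can be built directly in Python:\n",
+    "\n",
+    "```python\n",
+    "from model_core_utils import models\n",
+    "\n",
+    "# Feature extraction settings for DNN Large: 12 labels, 16 kHz audio, 1 s clips,\n",
+    "# 40 ms windows with a 40 ms stride and 10 MFCC coefficients per frame.\n",
+    "settings = models.prepare_model_settings(\n",
+    "    label_count=12, sample_rate=16000, clip_duration_ms=1000,\n",
+    "    window_size_ms=40, window_stride_ms=40, dct_coefficient_count=10)\n",
+    "\n",
+    "# model_size_info [436, 436, 436] gives three hidden Dense layers of 436 units each.\n",
+    "model = models.create_model(settings, 'dnn', [436, 436, 436], True)\n",
+    "model.summary()  # input shape (None, 250), softmax output over 12 labels\n",
+    "```"
+   ]
+  },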
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+    "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ centroid initialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
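+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the core of the clustering step in [optimisations.py](optimisations.py) is roughly the following, where ```model``` is a trained Keras model restored from a checkpoint:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow_model_optimization as tfmot\n",
+    "\n",
+    "# Cluster all kernel weights into 32 clusters, initialised with kmeans++.\n",
+    "clustering_params = {\n",
+    "    'number_of_clusters': 32,\n",
+    "    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}\n",
+    "clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)\n",
+    "\n",
+    "# ...compile and fine-tune clustered_model as usual, then strip the clustering\n",
+    "# wrappers before saving the checkpoint used for TFLite conversion.\n",
+    "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+    "```"
+   ]
+  },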
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+    "[post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+    "quantize the trained models.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+    "The ```inference_type``` parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
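+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a generic illustration of post-training integer quantization (not the exact code in ```convert_to_tflite.py```), the conversion follows the standard TensorFlow Lite flow; here ```representative_data``` is assumed to be a generator yielding calibration feature vectors:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "converter.representative_dataset = representative_data\n",
+    "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+    "converter.inference_input_type = tf.int8\n",
+    "converter.inference_output_type = tf.int8\n",
+    "\n",
+    "with open('dnn_quantized.tflite', 'wb') as f:\n",
+    "    f.write(converter.convert())\n",
+    "```"
+   ]
+  },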
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+    "You can run TFLite inference on the fp32 and int8 model files using the following command:\n",
+ "\n",
+ "```python dnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
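+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of single inference with the TFLite interpreter is shown below; it assumes ```features``` is a (1, 250) int8 NumPy array produced by the same MFCC feature extraction used during training:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "interpreter = tf.lite.Interpreter(model_path='dnn_quantized.tflite')\n",
+    "interpreter.allocate_tensors()\n",
+    "input_details = interpreter.get_input_details()[0]\n",
+    "output_details = interpreter.get_output_details()[0]\n",
+    "\n",
+    "interpreter.set_tensor(input_details['index'], features)\n",
+    "interpreter.invoke()\n",
+    "probabilities = interpreter.get_tensor(output_details['index'])[0]\n",
+    "print('Predicted label index:', int(np.argmax(probabilities)))\n",
+    "```"
+   ]
+  },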
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..6d3f666
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 1ce04d01ed7decc016076a868f22858d8f092942 |
+| Size (Bytes) | 1985048 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 86.65% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | fp32 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is the processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..38082c2
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 86.65%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 1985048
+ filename: dnn_l.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 1ce04d01ed7decc016076a868f22858d8f092942
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is the processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite
new file mode 100644
index 0000000..e5cbfe0
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dffdcf515fce70988132d98f8007564e0b303d0b463c422f039e2074cb29fc51
+size 1985048
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..5c996be
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77bdd1510d16c990db4276179453648d51e6526f4fbbe29091c183316184c827
+size 1128
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..98bc3fd
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bc29017fbb5d27101b9b96399c1fbc857a07871d759ca39a20de0b39ecc0396
+size 176
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..db3aa64
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 2b1ee34e4c87ba6f24092c7457593227099efaf1 |
+| Size (Bytes) | 502272 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 86.01% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | int8 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is the processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..7040a89
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 86.01%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 502272
+ filename: dnn_l_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 2b1ee34e4c87ba6f24092c7457593227099efaf1
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is the processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/dnn_l_quantized.tflite b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/dnn_l_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/dnn_l_quantized.tflite
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/dnn_l_quantized.tflite
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb
new file mode 100644
index 0000000..364939d
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06b16edf52376997d110f024184261ef588cd3309d8175c8769aa45482cd0164
+size 10087
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb
new file mode 100644
index 0000000..59d2022
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acb29f21ca89c9369eca08f583daaf3c7e64cd26ab5fec4cb0b95cf9d04435ef
+size 85126
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..4d554fc
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11f3d672a01c44c0e86a5f485ddfe4b2e5c8c6770563a6e0520297ed1e029579
+size 1985615
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index
new file mode 100644
index 0000000..fc9e90c
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1ae6b295e9da819138459f3684755e71c2fac683da141510581996541e509e6
+size 642
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.data-00000-of-00001
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.index b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.index
similarity index 100%
rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.index
rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.index
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
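+# Illustrative note: with the DNN Large recipe in recreate_model.sh
+# (sample_rate=16000, clip_duration_ms=1000, window_size_ms=40,
+# window_stride_ms=40, dct_coefficient_count=10 and 12 labels),
+# window_size_samples = window_stride_samples = 640, so
+# spectrogram_length = 1 + (16000 - 640) // 640 = 25 and
+# fingerprint_size = 10 * 25 = 250, matching the model's (1, 250) input.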
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+      model_size_info: Array with specific information for the chosen architecture
+        (e.g. convolutional parameters, number of layers).
+      is_training: Whether the model is being created for training; used by the
+        Basic LSTM architecture to decide whether to unroll the LSTM.
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+        raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+                        f'should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+ is_training: Determining whether the use of the model is for training or for something else.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+ first_filter_stride_x = model_size_info[4] # Frequency_axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+    # Fully connected layer followed by batch norm and ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+ # Output connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+    """Prints the number of unique kernel weight values (clusters) for each layer of the model."""
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..cb54318
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/dnn_0.87_ckpt
+train=false
+
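+# Usage:
+#   bash ./recreate_model.sh                 # convert the supplied pre-trained checkpoint to TFLite (fp32 and int8)
+#   bash ./recreate_model.sh --train         # train DNN Large from scratch, then convert the baseline checkpoint
+#   bash ./recreate_model.sh --ckpt <path>   # convert a specific checkpoint (path is user-supplied)
+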
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DNN Large training
+if [ "$train" = true ]
+then
+python train.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_L/retrain_logs --train_dir work/DNN/DNN_L/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/train.py b/models/keyword_spotting/dnn_large/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/README.md b/models/keyword_spotting/dnn_large/tflite_int8/README.md
deleted file mode 100644
index 40a0507..0000000
--- a/models/keyword_spotting/dnn_large/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DNN Large INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DNN Large model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 2b1ee34e4c87ba6f24092c7457593227099efaf1 |
-| Size (Bytes) | 502272 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.863 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml
deleted file mode 100644
index 68c8968..0000000
--- a/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 86.26%
-description: 'This is a fully quantized version (asymmetrical int8) of the DNN Large
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 502272
- filename: dnn_l_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 2b1ee34e4c87ba6f24092c7457593227099efaf1
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 250)
- example_input:
- path: models/keyword_spotting/dnn_large/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 250
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/README.md
new file mode 100644
index 0000000..8005a3c
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DNN Medium model package
+
+This folder contains code that will allow you to recreate the DNN Medium keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DNN_Medium |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+In order to recreate the model you will first need to be running ```Python3.7``` and to install the requirements listed in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced.
+The quantized version is produced using post-training quantization to fully quantize the model.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training will then be performed and should produce a model that reaches the accuracy stated in this repository.
+Note that the TFLite export will still use the pre-trained checkpoint files, so to export your newly trained model you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <checkpoint path>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions, such as the number of layers and the convolution filter size/stride, as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](model_core_utils/models.py).
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. We use 32 weight clusters and the kmeans++ cluster initialization method as the clustering hyperparameters.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
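+
+For reference, applying this kind of weight clustering with the TensorFlow Model Optimization toolkit typically looks like the sketch below. The helper name and fine-tuning settings are illustrative placeholders; `optimisations.py` in this folder remains the authoritative implementation.
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+
+def cluster_and_finetune(model, train_data, fine_tune_steps=1000):
+    """Sketch: apply 32-cluster weight clustering with kmeans++ init, then fine-tune."""
+    clustering_params = {
+        'number_of_clusters': 32,
+        'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+    }
+    clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+    clustered_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
+                            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+                            metrics=['accuracy'])
+    clustered_model.fit(train_data, steps_per_epoch=fine_tune_steps, epochs=1)
+    # Strip the clustering wrappers before saving weights for quantization and TFLite conversion.
+    return tfmot.clustering.keras.strip_clustering(clustered_model)
+```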
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models super simple.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional*: use it if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32, which keeps the model inputs and outputs in floating point.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
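+
+For reference, the int8 conversion performed by `convert_to_tflite.py` boils down to the standard TFLite post-training quantization flow sketched below. This is a minimal sketch: the function name is illustrative, and the script itself additionally supports int16 and fp32 conversion and loads the model from a checkpoint.
+
+```python
+import tensorflow as tf
+
+
+def quantize_to_int8(keras_model, rep_dataset, tflite_path='dnn_quantized.tflite'):
+    """Sketch: full-integer post-training quantization with int8 inputs and outputs."""
+    converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    # rep_dataset is a callable yielding [mfcc] samples, e.g. drawn from the validation set.
+    converter.representative_dataset = rep_dataset
+    converter.inference_input_type = tf.int8
+    converter.inference_output_type = tf.int8
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    with open(tflite_path, 'wb') as f:
+        f.write(converter.convert())
+```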
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+            if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    if inference_type == 'int8':
+        converter.inference_input_type = tf.int8
+        converter.inference_output_type = tf.int8
+        supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    elif inference_type == 'int16':
+        converter.inference_input_type = tf.int16
+        converter.inference_output_type = tf.int16
+        supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+    else:
+        # Default ('fp32'): quantize internals but keep float32 model inputs and outputs.
+        supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS
+
+    # Full integer post training quantization needs a representative dataset.
+    converter.representative_dataset = rep_dataset
+    converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+        help='If quantize is true, whether the model input and output should be fp32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+        Calculated MFCC features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+            raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+            raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+            None if '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+        # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+    """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+    """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py
new file mode 100644
index 0000000..4481dcd
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from dnn_m_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+    print(f'Validation accuracy = {val_accuracy * 100:.2f}% '
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+    print(f'Test accuracy = {test_accuracy * 100:.2f}% '
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+    print(f'Validation accuracy = {val_accuracy * 100:.2f}% '
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+    print(f'Test accuracy = {test_accuracy * 100:.2f}% '
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..ac8b78c
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DNN_Medium - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DNN_Medium \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:21:58.189962: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:22:48.489206: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:22:48.528844: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:22:48.528880: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:22:48.548795: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:22:48.548866: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:22:48.551645: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:22:48.551935: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:22:48.552501: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:22:48.553238: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:22:48.553392: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:22:48.553886: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:22:48.554176: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:22:48.554998: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:22:48.555410: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:22:48.555527: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:22:48.994481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:22:48.994520: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:22:48.994528: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:22:48.995028: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:22:50.146418: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:22:50.411740: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:22:50.411969: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:22:50.412348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:22:50.412596: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:22:50.412627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:22:50.412636: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:22:50.412643: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:22:50.412919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:22:50.431567: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:22:50.433318: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.017ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.003ms.\n",
+ "\n",
+ "2023-01-31 13:22:50.470457: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:22:50.470496: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:22:50.473049: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:22:50.475051: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:22:50.475342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:22:50.475376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:22:50.475387: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:22:50.475395: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:22:50.475693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to dnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 13:22:50.520336: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 265 9 7 5 18 11 12 17 5 8 14]\n",
+ " [ 0 6 346 9 0 2 22 5 1 0 1 5]\n",
+ " [ 0 9 8 323 8 14 3 5 0 1 2 33]\n",
+ " [ 0 4 0 2 304 1 3 3 4 17 9 3]\n",
+ " [ 0 8 1 19 1 326 2 1 7 0 0 12]\n",
+ " [ 0 2 24 2 3 1 304 13 0 0 0 3]\n",
+ " [ 0 10 1 1 4 1 4 336 1 2 0 3]\n",
+ " [ 1 10 1 1 7 2 0 2 326 9 1 3]\n",
+ " [ 1 2 0 1 27 0 1 1 11 321 4 4]\n",
+ " [ 2 5 0 0 16 2 2 1 1 2 318 1]\n",
+ " [ 0 13 0 43 6 13 1 2 3 3 1 287]]\n",
+ "Validation accuracy = 86.10%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 295 7 11 6 6 13 12 24 8 5 21]\n",
+ " [ 0 12 380 3 0 4 15 1 0 0 0 4]\n",
+ " [ 1 11 2 332 0 22 1 0 0 0 0 36]\n",
+ " [ 0 14 1 2 357 2 2 5 12 11 11 8]\n",
+ " [ 0 18 5 18 6 329 5 1 4 0 2 18]\n",
+ " [ 0 10 25 3 4 1 347 15 1 0 2 4]\n",
+ " [ 0 20 1 0 5 1 14 349 1 5 0 0]\n",
+ " [ 0 12 0 1 5 9 0 0 347 16 2 4]\n",
+ " [ 0 15 0 1 15 1 5 2 12 339 3 9]\n",
+ " [ 0 5 0 3 21 2 4 1 2 1 368 4]\n",
+ " [ 0 10 1 62 8 13 3 1 0 0 1 303]]\n",
+ "Test accuracy = 84.95%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:23:02.712653: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:23:53.488800: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:23:53.524175: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:23:53.524209: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:23:53.544183: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:23:53.544253: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:23:53.546889: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:23:53.547146: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:23:53.547744: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:23:53.548454: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:23:53.548596: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:23:53.548947: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:23:53.549238: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:23:53.549958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:23:53.550439: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:23:53.550510: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:23:53.960933: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:23:53.960972: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:23:53.960979: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:23:53.961483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:23:55.053376: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:23:55.321894: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:23:55.322084: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:23:55.322539: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:23:55.322808: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:23:55.322839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:23:55.322850: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:23:55.322858: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:23:55.323143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:23:55.347442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:23:55.348486: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 13:23:55.387556: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:23:55.387602: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:23:55.390277: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:23:55.392318: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:23:55.392627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:23:55.392665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:23:55.392681: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:23:55.392693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:23:55.393015: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:23:55.414179: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to dnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 272 6 8 8 19 9 12 17 6 4 10]\n",
+ " [ 0 11 341 9 5 2 20 6 0 0 0 3]\n",
+ " [ 0 15 9 319 13 13 2 4 1 1 3 26]\n",
+ " [ 0 6 0 3 307 1 1 2 3 16 9 2]\n",
+ " [ 0 11 1 20 12 312 3 0 6 0 1 11]\n",
+ " [ 0 7 26 3 5 1 294 11 1 1 1 2]\n",
+ " [ 0 13 1 1 9 2 5 326 1 1 2 2]\n",
+ " [ 2 13 0 0 7 4 1 2 318 10 4 2]\n",
+ " [ 1 4 0 2 37 0 1 2 12 308 3 3]\n",
+ " [ 2 5 0 0 21 2 2 1 1 3 312 1]\n",
+ " [ 0 16 1 43 9 15 1 3 1 3 1 279]]\n",
+ "Validation accuracy = 84.57%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 303 7 13 6 4 12 9 22 8 6 18]\n",
+ " [ 0 13 370 5 4 3 15 1 1 0 2 5]\n",
+ " [ 0 12 6 335 4 19 1 1 1 0 0 26]\n",
+ " [ 0 14 1 4 354 1 0 3 15 14 11 8]\n",
+ " [ 0 26 5 26 10 316 5 2 3 0 1 12]\n",
+ " [ 0 15 25 2 9 1 334 17 1 0 2 6]\n",
+ " [ 0 19 1 0 10 1 14 338 4 4 4 1]\n",
+ " [ 0 16 1 2 8 8 1 0 339 11 6 4]\n",
+ " [ 0 15 0 1 27 0 6 2 12 329 3 7]\n",
+ " [ 0 9 0 3 22 2 4 1 2 2 360 6]\n",
+ " [ 0 20 0 63 16 12 1 3 1 1 6 279]]\n",
+ "Test accuracy = 83.13%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
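+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick illustration of how *--model_size_info* is interpreted for the DNN architecture, the hedged sketch below mirrors ```create_dnn_model``` in [models.py](model_core_utils/models.py): each entry in the list becomes one fully-connected hidden layer with ReLU activation, followed by a softmax output layer. The helper name ```build_dnn``` and the 250-feature input size (taken from the Network Inputs table in the model archive) are used here for illustration only.\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def build_dnn(fingerprint_size, label_count, model_size_info=(128, 128, 128)):\n",
+ "    # One Dense + ReLU hidden layer per entry in model_size_info.\n",
+ "    inputs = tf.keras.Input(shape=(fingerprint_size,), name='input')\n",
+ "    x = inputs\n",
+ "    for units in model_size_info:\n",
+ "        x = tf.keras.layers.Dense(units=units, activation='relu')(x)\n",
+ "    # Softmax output over the keyword classes.\n",
+ "    outputs = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)\n",
+ "    return tf.keras.Model(inputs, outputs)\n",
+ "\n",
+ "# DNN Medium: 250 flattened MFCC features in, 12 classes out.\n",
+ "build_dnn(fingerprint_size=250, label_count=12).summary()\n",
+ "```"
+ ]
+ },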
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
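+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you prefer to evaluate from Python rather than the command line, the hedged sketch below follows what ```evaluation.py``` does when given a checkpoint: rebuild the model with ```models.create_model```, load the weights, then run Keras evaluation. Here ```model_settings``` (from ```models.prepare_model_settings```), ```test_data``` (a ```tf.data.Dataset``` of MFCC/label pairs) and the checkpoint path are assumed to exist already.\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "from model_core_utils import models\n",
+ "\n",
+ "# Assumed: model_settings and test_data have been prepared beforehand.\n",
+ "model = models.create_model(model_settings, 'dnn', [128, 128, 128], False)\n",
+ "model.load_weights('path/to/dnn_checkpoint').expect_partial()\n",
+ "\n",
+ "model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+ "              metrics=['accuracy'])\n",
+ "test_loss, test_acc = model.evaluate(test_data.batch(100))\n",
+ "print(f'Test accuracy: {test_acc * 100:.2f}%')\n",
+ "```"
+ ]
+ },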
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
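+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The clustering step is a thin wrapper around the TensorFlow Model Optimization API. The hedged sketch below shows the core of what ```optimisations.py``` does with the hyperparameters mentioned above (32 clusters, k-means++ initialization); the data pipeline and the fine-tuning call are omitted, and the learning rate and output checkpoint name are illustrative only.\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "import tensorflow_model_optimization as tfmot\n",
+ "\n",
+ "# 'model' is assumed to be a trained Keras model restored from a checkpoint.\n",
+ "clustering_params = {\n",
+ "    'number_of_clusters': 32,\n",
+ "    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}\n",
+ "clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)\n",
+ "\n",
+ "clustered_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),\n",
+ "                        loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+ "                        metrics=['accuracy'])\n",
+ "# ... fine-tune with clustered_model.fit(...) on the training data ...\n",
+ "\n",
+ "# Strip the clustering wrappers before saving or converting the weights.\n",
+ "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+ "final_model.save_weights('dnn_clustered_ckpt')\n",
+ "```"
+ ]
+ },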
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
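+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Post-training quantization in TensorFlow is driven by the standard TFLite converter. A minimal int8 sketch (not necessarily identical to ```convert_to_tflite.py```) is shown below; it assumes ```model``` is the trained Keras model and ```calibration_data``` is an array of MFCC feature vectors (for example drawn from the training set) used to calibrate activation ranges.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def representative_dataset():\n",
+ "    # Yield a few hundred calibration samples, one at a time, as float32.\n",
+ "    for sample in calibration_data[:300]:\n",
+ "        yield [np.expand_dims(sample, axis=0).astype(np.float32)]\n",
+ "\n",
+ "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+ "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+ "converter.representative_dataset = representative_dataset\n",
+ "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+ "converter.inference_input_type = tf.int8   # fully quantized inputs/outputs\n",
+ "converter.inference_output_type = tf.int8\n",
+ "\n",
+ "with open('dnn_quantized.tflite', 'wb') as f:\n",
+ "    f.write(converter.convert())\n",
+ "```"
+ ]
+ },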
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
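+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, the core of the TFLite accuracy evaluation that ```evaluation.py --tflite_path``` performs looks roughly like the hedged sketch below. ```test_data``` is again assumed to be a ```tf.data.Dataset``` of MFCC/label pairs; for the int8 model the inputs are scaled with the quantization parameters read from the interpreter.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "interpreter = tf.lite.Interpreter(model_path='dnn_quantized.tflite')\n",
+ "interpreter.allocate_tensors()\n",
+ "input_detail = interpreter.get_input_details()[0]\n",
+ "output_detail = interpreter.get_output_details()[0]\n",
+ "\n",
+ "correct = total = 0\n",
+ "for mfcc, label in test_data:  # assumed dataset of single (feature, label) pairs\n",
+ "    features = mfcc.numpy().reshape(input_detail['shape']).astype(np.float32)\n",
+ "    if input_detail['dtype'] == np.int8:\n",
+ "        scale, zero_point = input_detail['quantization']\n",
+ "        features = np.round(features / scale + zero_point).astype(np.int8)\n",
+ "    interpreter.set_tensor(input_detail['index'], features)\n",
+ "    interpreter.invoke()\n",
+ "    prediction = np.argmax(interpreter.get_tensor(output_detail['index']))\n",
+ "    correct += int(prediction == int(label))\n",
+ "    total += 1\n",
+ "print(f'Test accuracy = {100 * correct / total:.2f}% (N={total})')\n",
+ "```"
+ ]
+ },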
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+ "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n",
+ "\n",
+ "```python dnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
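+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you just want to sanity-check a converted file without the audio front end, the hedged sketch below runs the int8 TFLite model on the example input shipped in the model archive and maps the top score to a label. The relative paths and the one-label-per-line ```labels.txt``` format are assumptions based on this package's layout.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "tflite_path = 'model_archive/TFLite/tflite_int8/dnn_m_quantized.tflite'\n",
+ "example_input = 'model_archive/TFLite/tflite_int8/testing_input/input/0.npy'\n",
+ "\n",
+ "interpreter = tf.lite.Interpreter(model_path=tflite_path)\n",
+ "interpreter.allocate_tensors()\n",
+ "input_detail = interpreter.get_input_details()[0]\n",
+ "output_detail = interpreter.get_output_details()[0]\n",
+ "\n",
+ "# The example .npy is assumed to already match the model's input shape and dtype.\n",
+ "interpreter.set_tensor(input_detail['index'], np.load(example_input))\n",
+ "interpreter.invoke()\n",
+ "scores = interpreter.get_tensor(output_detail['index'])[0]\n",
+ "\n",
+ "labels = [line.strip() for line in open('validation_utils/labels.txt')]\n",
+ "print('Predicted keyword:', labels[int(np.argmax(scores))])\n",
+ "```"
+ ]
+ },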
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..54631cd
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 3c20c6ee24ee41ed6db968ff58d69f5823c94036 |
+| Size (Bytes) | 797768 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 84.95% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | fp32 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..a650fd3
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 84.95%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 797768
+ filename: dnn_m.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 3c20c6ee24ee41ed6db968ff58d69f5823c94036
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+ - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite
new file mode 100644
index 0000000..e4e30d7
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8380c3ff3a3152c5ab5cc2a226c73707924d906e468f708513ffa84d6e9a1d96
+size 797768
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..85f3e34
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f811913684442a9517879b173e29799094e4261cbef84c0a84536564179349
+size 1128
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..6af5cd7
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fed63e3ed1b354a3927bf735223654a482c6745299f5e2a57ed3974dfef295f1
+size 176
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..1e65aad
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 |
+| Size (Bytes) | 203832 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 83.93% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | int8 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..c519ab1
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 83.93%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 203832
+ filename: dnn_m_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+ - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/dnn_m_quantized.tflite b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/dnn_m_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/dnn_m_quantized.tflite
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/dnn_m_quantized.tflite
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb
new file mode 100644
index 0000000..08ef7e5
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4dfba08e6695d3429dc605cf00dd1e6950f646faf61fc9876de9471f66ee419
+size 10087
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb
new file mode 100644
index 0000000..770dcc1
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef3a9281ac9bc4de4ce805938bfdb673c0c06627ce977e11521c0782c1999256
+size 85126
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..afb21fe
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69f2943b2684b7c153e67808422daa3f61b229dd3a6092b5ae5af95d1eaf3ff6
+size 798335
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index
new file mode 100644
index 0000000..7a51ce6
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d85b2373052882c55abdeb07a4c061ad4aa23c0c36a72db08dc17a515d30363
+size 641
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.data-00000-of-00001
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.index b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.index
similarity index 100%
rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.index
rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.index
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+ model_size_info: Array with specific information for the chosen architecture
+ (e.g. convolutional parameters, number of layers).
+ is_training: Whether the model is being built for training or for inference.
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+ raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+ 'should be one of "single_fc", "dnn", "basic_lstm", "cnn" or "ds_cnn"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden layers and
+ each element in the array represents the number of neurons in that layer.
+ is_training: Whether the model is being built for training or for inference.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden layers and
+ each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+ first_filter_stride_x = model_size_info[4] # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+ # Output connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..2a465cf
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/dnn_0.86_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DNN Medium training
+if [ "$train" = true ]
+then
+python train.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_M/retrain_logs --train_dir work/DNN/DNN_M/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/train.py b/models/keyword_spotting/dnn_medium/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/README.md b/models/keyword_spotting/dnn_medium/tflite_int8/README.md
deleted file mode 100644
index cfc52ce..0000000
--- a/models/keyword_spotting/dnn_medium/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DNN Medium INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DNN Medium model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 |
-| Size (Bytes) | 203832 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.844 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml
deleted file mode 100644
index abcfbd8..0000000
--- a/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 84.44%
-description: 'This is a fully quantized version (asymmetrical int8) of the DNN Medium
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 203832
- filename: dnn_m_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 250)
- example_input:
- path: models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 250
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/README.md b/models/keyword_spotting/dnn_small/model_package_tf/README.md
new file mode 100644
index 0000000..7d73dab
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DNN Small model package
+
+This folder contains code that will allow you to recreate the DNN Small keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DNN_Small |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Optimised |
+
+## Model Recreation
+
+In order to recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.
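+
+For example, from inside this folder:
+
+```bash
+pip install -r requirements.txt
+```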
+
+Once these requirements are satisfied, you can execute the recreation script contained within this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced.
+The quantized version will use post-training quantization to fully quantize it.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model close to the stated accuracy in this repository.
+Note that exporting to TFLite will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path/to/checkpoint>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions, such as the number of layers and convolution filter size/stride, as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](models.py).
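+
+For illustration only (this is not the code in [models.py](models.py)), the `dnn` architecture from the Hello Edge paper is a stack of fully-connected layers, so the three values passed above can be read as the number of neurons in each layer, roughly:
+
+```python
+import tensorflow as tf
+
+def dnn_sketch(fingerprint_size, label_count, model_size_info=(128, 128, 128)):
+    """Hypothetical sketch: one Dense + ReLU layer per entry in model_size_info."""
+    inputs = tf.keras.Input(shape=(fingerprint_size,), name='input')
+    x = inputs
+    for units in model_size_info:
+        x = tf.keras.layers.Dense(units, activation='relu')(x)
+    outputs = tf.keras.layers.Dense(label_count, activation='softmax')(x)
+    return tf.keras.Model(inputs, outputs)
+```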
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
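+
+For reference, the core of what `optimisations.py` does is a standard application of the clustering API. A minimal sketch, assuming `model`, `train_data` and `val_data` are the trained Keras model and tf.data pipelines used above, looks like:
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}
+
+# Wrap the trained model, fine-tune briefly, then strip the clustering wrappers.
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+clustered_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
+                        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+                        metrics=['accuracy'])
+clustered_model.fit(train_data, epochs=3, validation_data=val_data)
+stripped_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```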
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path/to/checkpoint> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
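+
+Under the hood, the int8 conversion in `convert_to_tflite.py` follows TensorFlow's standard full-integer post-training quantization flow. A rough sketch, assuming `model` is the trained Keras model and `rep_dataset` is a generator yielding a small number of representative MFCC inputs:
+
+```python
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = rep_dataset
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8
+converter.inference_output_type = tf.int8
+
+with open('dnn_quantized.tflite', 'wb') as f:
+    f.write(converter.convert())
+```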
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+ if i > NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS  # Default op set when inference_type is 'fp32'.
+ if inference_type == 'int8':
+     converter.inference_input_type = tf.int8
+     converter.inference_output_type = tf.int8
+     supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+ elif inference_type == 'int16':
+     converter.inference_input_type = tf.int16
+     converter.inference_output_type = tf.int16
+     supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+ # Post training quantization needs a representative dataset.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+ help='If quantize is true, whether the model input and output is float32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+ Calculated mffc features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+ ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+ raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+ None if '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+ # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+     return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
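As a quick aid to the feature-extraction flags above, the arithmetic below (purely illustrative) shows how the window parameters determine the number of MFCC frames and hence the length of the flattened input passed to the model. Note that the packaged DNN Small model expects a length-250 input (see the model archive README), so the flags used at inference time must match the values used during training rather than these defaults.

```python
# Illustrative arithmetic only: how the flags above translate into frames and
# flattened-input length (these defaults do NOT match the packaged DNN Small model).
sample_rate, clip_duration_ms = 16000, 1000
window_size_ms, window_stride_ms, dct_coefficient_count = 30.0, 10.0, 40

desired_samples = int(sample_rate * clip_duration_ms / 1000)        # 16000
window_size_samples = int(sample_rate * window_size_ms / 1000)      # 480
window_stride_samples = int(sample_rate * window_stride_ms / 1000)  # 160
frames = 1 + (desired_samples - window_size_samples) // window_stride_samples
print(frames, frames * dct_coefficient_count)  # 98 frames -> 3920 values
```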
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+    """Calls the forward pass of a TFLite model file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
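Since `tflite_inference` wraps the quantize/dequantize handling, it can also be called directly, for example to sanity-check a converted model. In the sketch below the model filename and the length-250 input are assumptions taken from the DNN Small model archive in this package; the random input is purely illustrative and real use would feed MFCCs from `calculate_mfcc`.

```python
# Illustrative sanity check: run tflite_inference() on a random MFCC-shaped input.
import numpy as np
from dnn_s_inference_tflite import tflite_inference

dummy_fingerprint = np.random.rand(1, 250).astype(np.float32)
scores = tflite_inference(dummy_fingerprint, 'dnn_s_quantized.tflite')
print(scores.shape)               # (1, 12): one score per keyword class
print(int(np.argmax(scores[0])))  # index of the most confident class
```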
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py
new file mode 100644
index 0000000..9cf3d0c
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from dnn_s_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
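For reference, `calculate_accuracy` above is simply the mean of element-wise matches between predictions and labels; a tiny worked example with illustrative values:

```python
# Illustrative values only: accuracy is the fraction of matching predictions.
import tensorflow as tf

predicted = tf.constant([0, 2, 3, 3])
expected = tf.constant([0, 2, 1, 3])
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, expected), tf.float32))
print(float(accuracy))  # 0.75
```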
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..1332d4e
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DNN_Small - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DNN_Small \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+    "This how-to guidance presents the key steps needed to reproduce everything in this package. The contents are organised as below, with internal navigation links so you can jump straight to each section.\n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+    "* [6.0 Single inference with the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+    "In order to recreate the model you will first need to be using ```Python3.7``` and to have installed the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:25:23.242199: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:26:16.311986: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:26:16.348776: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:26:16.348818: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:26:16.369436: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:26:16.369509: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:26:16.372294: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:26:16.372684: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:26:16.373267: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:26:16.374012: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:26:16.374168: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:26:16.374680: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:26:16.374967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:26:16.375884: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:26:16.376614: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:26:16.376682: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:26:16.822126: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:26:16.822161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:26:16.822173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:26:16.822780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:26:17.956358: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:26:18.216079: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:26:18.216285: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:26:18.216661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:26:18.216906: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:26:18.216936: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:26:18.216946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:26:18.216953: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:26:18.217236: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:26:18.235442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:26:18.236450: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 13:26:18.268723: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:26:18.268758: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:26:18.271003: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:26:18.272912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:26:18.273329: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:26:18.273362: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:26:18.273373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:26:18.273385: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:26:18.273700: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to dnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 13:26:18.314546: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 238 16 9 5 30 11 10 15 7 9 21]\n",
+ " [ 0 7 341 8 0 6 26 5 1 0 0 3]\n",
+ " [ 0 8 7 316 5 17 0 5 2 1 4 41]\n",
+ " [ 0 8 1 2 287 3 5 4 6 19 7 8]\n",
+ " [ 0 10 1 22 2 317 2 0 5 2 1 15]\n",
+ " [ 0 5 27 2 1 2 299 9 0 3 0 4]\n",
+ " [ 1 13 2 2 0 2 3 334 2 2 0 2]\n",
+ " [ 2 9 1 1 6 6 2 0 318 13 1 4]\n",
+ " [ 1 4 1 0 29 0 1 1 17 311 4 4]\n",
+ " [ 2 2 0 1 15 5 0 1 4 5 310 5]\n",
+ " [ 0 10 1 38 8 26 2 1 3 1 1 281]]\n",
+ "Validation accuracy = 83.76%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 274 12 20 8 11 16 13 20 7 8 19]\n",
+ " [ 1 9 375 3 0 6 20 0 0 0 1 4]\n",
+ " [ 0 14 5 312 1 33 6 0 0 0 3 31]\n",
+ " [ 0 12 0 3 362 5 3 5 8 11 13 3]\n",
+ " [ 0 10 2 34 2 332 5 0 5 0 3 13]\n",
+ " [ 0 12 27 5 4 1 339 17 1 2 2 2]\n",
+ " [ 0 12 0 2 4 1 9 362 1 3 0 2]\n",
+ " [ 1 12 0 3 3 14 1 1 336 20 1 4]\n",
+ " [ 1 6 3 2 16 0 3 1 19 338 2 11]\n",
+ " [ 0 5 1 2 22 4 3 0 0 2 367 5]\n",
+ " [ 0 17 0 65 6 17 3 2 2 5 2 283]]\n",
+ "Test accuracy = 83.60%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 13:26:30.279559: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 13:27:20.964068: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 13:27:21.007726: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:27:21.007765: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:27:21.028042: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 13:27:21.028131: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 13:27:21.030956: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 13:27:21.031218: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 13:27:21.031788: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 13:27:21.032512: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 13:27:21.032668: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 13:27:21.033033: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:27:21.033325: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 13:27:21.034039: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:27:21.034415: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:27:21.034486: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 13:27:21.478837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:27:21.478873: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:27:21.478882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:27:21.479411: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 13:27:22.568489: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 13:27:22.830822: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 13:27:22.831041: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 13:27:22.831444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:27:22.831775: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:27:22.831807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:27:22.831816: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:27:22.831823: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:27:22.832109: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:27:22.851539: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 13:27:22.852738: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.013ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 13:27:22.888443: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 13:27:22.888491: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 13:27:22.891172: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 13:27:22.893139: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 13:27:22.893390: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 13:27:22.893420: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 13:27:22.893430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 13:27:22.893437: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 13:27:22.893709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 13:27:22.923079: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to dnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 241 17 14 5 27 12 9 17 6 7 16]\n",
+ " [ 0 11 340 11 4 5 21 4 1 0 0 0]\n",
+ " [ 0 15 7 315 10 14 1 2 2 1 6 33]\n",
+ " [ 0 10 1 6 282 4 6 3 5 19 10 4]\n",
+ " [ 0 17 2 26 8 300 1 0 6 0 4 13]\n",
+ " [ 0 8 30 3 6 1 293 7 0 1 2 1]\n",
+ " [ 0 17 2 4 6 1 9 316 1 2 4 1]\n",
+ " [ 2 9 1 1 10 4 2 2 317 11 0 4]\n",
+ " [ 1 8 1 2 33 0 0 2 15 303 6 2]\n",
+ " [ 2 6 0 2 25 5 0 0 2 1 304 3]\n",
+ " [ 0 16 1 47 15 27 2 1 3 1 4 255]]\n",
+ "Validation accuracy = 81.82%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 281 13 23 11 10 14 14 21 6 6 9]\n",
+ " [ 0 12 372 6 6 6 13 0 0 0 1 3]\n",
+ " [ 0 19 8 311 6 28 4 0 0 0 4 25]\n",
+ " [ 0 20 2 6 359 6 1 2 6 7 14 2]\n",
+ " [ 0 15 5 36 10 318 3 0 4 2 2 11]\n",
+ " [ 0 12 33 6 13 3 320 19 0 2 2 2]\n",
+ " [ 1 17 0 4 5 1 11 347 1 1 7 1]\n",
+ " [ 0 16 0 6 8 16 1 1 326 18 3 1]\n",
+ " [ 1 6 3 4 37 1 3 2 19 314 3 9]\n",
+ " [ 0 10 0 6 28 3 4 0 0 1 354 5]\n",
+ " [ 0 19 0 73 18 19 3 2 3 4 2 259]]\n",
+ "Test accuracy = 81.17%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+    "Training is then performed and should produce a model with the accuracy stated in this repository. Note that exporting to TFLite will still use the baseline pre-trained checkpoint files, so you will need to re-run the script, this time supplying the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+    "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf), provided the right hyperparameters are used. The training commands with all the hyperparameters needed to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper; other variants are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+    "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+    "dimensions (such as the number of layers and convolution filter size/stride) as a list to models.py,\n",
+    "which builds the TensorFlow graph based on the provided model architecture\n",
+    "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+    "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+    "The ```inference_type``` parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 6.0 Single inference with the TFLite model files\n",
+    "\n",
+    "You can run TFLite inference with the fp32 and int8 model files using the following command:\n",
+ "\n",
+ "```python dnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
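Section 5.0 of the notebook delegates quantization to `convert_to_tflite.py`. For readers who want to see what a post-training quantization flow of this kind looks like, a generic, hedged sketch is shown below; the saved-model path matches this package's archive, but the representative-dataset generator and output filename are placeholders rather than the package's actual implementation.

```python
# Generic post-training quantization sketch (not the package's convert_to_tflite.py).
# In the real flow the representative dataset would yield MFCC features from the
# training set rather than random values.
import numpy as np
import tensorflow as tf

def representative_dataset():
    for _ in range(100):
        yield [np.random.rand(1, 250).astype(np.float32)]

saved_model_dir = 'model_archive/model_source/saved_model/dnn_small'  # from this package
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

with open('dnn_quantized.tflite', 'wb') as f:
    f.write(converter.convert())
```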
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..78f4f45
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 7491539a547ee30b87c266e6bbb4455e0c8f556d |
+| Size (Bytes) | 320648 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 83.60% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | fp32 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability of each of the 12 keywords |
\ No newline at end of file
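The example input/output paths listed in the tables above can be used for a quick regression check of the packaged fp32 model. A hedged sketch is below; the tolerance is an assumption rather than a documented requirement.

```python
# Regression-check sketch for the fp32 package using the example .npy files above.
import numpy as np
import tensorflow as tf

base = 'models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32'
interpreter = tf.lite.Interpreter(model_path=f'{base}/dnn_s.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

x = np.load(f'{base}/testing_input/input/0.npy')
interpreter.set_tensor(input_details['index'], x.astype(np.float32))
interpreter.invoke()
y = interpreter.get_tensor(output_details['index'])

expected = np.load(f'{base}/testing_output/Identity/0.npy')
print(np.allclose(y, expected, atol=1e-5))  # True if the archive output is reproduced
```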
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..0458507
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 83.60%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 320648
+ filename: dnn_s.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 7491539a547ee30b87c266e6bbb4455e0c8f556d
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+  - description: The probability of each of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+      use_case: Output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite
new file mode 100644
index 0000000..84cf83d
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7beaf5a4b740228324fc48db72eb2dab16854278676cb3f67268fee5910ab5f8
+size 320648
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..fd525dc
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f9883bea3889da8d87477965f034c7f8a453636a4ed5897c34c0798a41924f8
+size 1128
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..3d71018
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b74580f29a9cea2e7f1f179e930c05d4d2ac884c70b535d7c5f988bc38c47258
+size 176
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..91932d2
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b |
+| Size (Bytes) | 83544 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 82.11% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | int8 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability of each of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..d653ebc
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 82.11%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 83544
+ filename: dnn_s_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 250)
+ example_input:
+ path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 250
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 250
+ output_nodes:
+  - description: The probability of each of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+      use_case: Output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - FULLY_CONNECTED
+ - RELU
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/dnn_s_quantized.tflite b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/dnn_s_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/dnn_s_quantized.tflite
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/dnn_s_quantized.tflite
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb
new file mode 100644
index 0000000..4f01a9c
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7035d087e4fea7940fc83080a1b64f4d8cdec6d8344aadb5876ff41994807bbf
+size 10087
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb
new file mode 100644
index 0000000..152a69e
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c53338f2dc3fb47b591a96d93710047cc31fe9aa697bbf51283ce3b7d3557fe
+size 84664
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..d945297
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd85a15e363ad2aeb3bf02308e5f89137221c1c6c658e71ccba21aefbba99d63
+size 321215
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index
new file mode 100644
index 0000000..35dd996
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc2c60477817e8647d6ebbe1409b40435de6bcaef280b0a41cf5713d3ec95393
+size 641
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.data-00000-of-00001
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.index b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.index
similarity index 100%
rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.index
rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.index
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
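+  # Worked example (illustrative only, not executed here): the DNN Small
+  # recreation script uses sample_rate=16000, clip_duration_ms=1000,
+  # window_size_ms=40, window_stride_ms=40 and dct_coefficient_count=10,
+  # giving desired_samples=16000, window_size_samples=640,
+  # window_stride_samples=640, spectrogram_length=1 + 15360 // 640 = 25 and
+  # fingerprint_size=10 * 25 = 250, matching the model's (1, 250) MFCC input.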
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+    model_size_info: Array with specific information for the chosen architecture
+      (e.g. convolutional parameters, number of layers).
+    is_training: Whether the model is being created for training or inference.
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+    raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+                    f'should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+    model_size_info: Length of the array defines the number of hidden layers and
+      each element in the array represents the number of neurons in that layer.
+    is_training: Whether the model will be used for training or for inference.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+    model_size_info: Length of the array defines the number of hidden layers and
+      each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
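+  # For the DNN Small model recreated in this package, model_size_info is
+  # [144, 144, 144] (see recreate_model.sh): three hidden fully-connected
+  # layers of 144 neurons each.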
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+  first_filter_stride_x = model_size_info[4]  # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+  inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
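+  # Illustrative layout only (not a released configuration): a model_size_info
+  # of [2, 64, 10, 4, 2, 2, 64, 3, 3, 1, 1] would describe two layers: a
+  # standard convolution with 64 features, a 10x4 (time x frequency) kernel and
+  # 2x2 stride, followed by one depthwise separable block with 64 features, a
+  # 3x3 kernel and 1x1 stride.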
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+  inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+ # Output connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
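+    # With this script's defaults (how_many_training_steps='3750,750' and
+    # learning_rate='0.001,0.0001') the boundary list is [3750], so fine-tuning
+    # runs at lr 0.001 for the first 3750 steps and at 0.0001 thereafter.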
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..d00f43f
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/dnn_0.84_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DNN Small training
+if [ "$train" = true ]
+then
+python train.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_S/retrain_logs --train_dir work/DNN/DNN_S/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/train.py b/models/keyword_spotting/dnn_small/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
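+    # For example, with the defaults how_many_training_steps='15000,3000' and
+    # eval_step_interval=400, training runs for ceil(18000 / 400) = 45 'epochs'
+    # of 400 steps each.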
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/README.md b/models/keyword_spotting/dnn_small/tflite_int8/README.md
deleted file mode 100644
index 1f5d3f8..0000000
--- a/models/keyword_spotting/dnn_small/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DNN Small INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b |
-| Size (Bytes) | 83544 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.825 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml
deleted file mode 100644
index 7f66d4d..0000000
--- a/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 82.45%
-description: 'This is a fully quantized version (asymmetrical int8) of the DNN Small
- model developed by Arm, with training checkpoints, from the Hello Edge paper. Code
- to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 83544
- filename: dnn_s_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: null
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 250)
- example_input:
- path: models/keyword_spotting/dnn_small/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 250
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md
new file mode 100644
index 0000000..c4e4d69
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DS-CNN Large model package
+
+This folder contains code that will allow you to recreate the DS-CNN Large keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DS_CNN_Large |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Hero |
+
+## Model Recreation
+
+To recreate the model you will first need to be using ```Python3.7``` and to have installed the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced.
+The quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model that reaches the accuracy stated in this repository.
+Note that the TFLite export will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path_to_checkpoint>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions (such as the number of layers and the convolution filter size/stride) as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](model_core_utils/models.py).
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. The clustering hyperparameters are 32 weight clusters with the kmeans++ centroid initialization method.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
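+
+For reference, the clustering applied by `optimisations.py` boils down to the following minimal sketch. The `model` below is a toy placeholder; the real script builds the network with `model_core_utils/models.py`, loads the trained checkpoint, and fine-tunes the clustered model before stripping:
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+# Toy placeholder standing in for the trained keyword spotting model.
+model = tf.keras.Sequential([tf.keras.layers.Dense(12, input_shape=(250,))])
+
+# 32 clusters with kmeans++ centroid initialization, as described above.
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS}
+
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+# ... compile and fine-tune clustered_model on the training data here ...
+
+# Strip the clustering wrappers before saving the checkpoint used for conversion.
+stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```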
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models super simple.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The *--inference_type* parameter is *optional* and is used when a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
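+
+For reference, the post-training quantization performed by `convert_to_tflite.py` is roughly the following minimal sketch. The model and representative dataset here are toy placeholders; the real script builds the network from the trained checkpoint and draws representative samples from the Speech Commands validation set:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Toy placeholder standing in for the trained keyword spotting model.
+model = tf.keras.Sequential([tf.keras.layers.Dense(12, input_shape=(250,), activation='softmax')])
+
+def representative_dataset():
+    # convert_to_tflite.py yields real MFCC features from the validation set here.
+    for _ in range(100):
+        yield [np.random.rand(1, 250).astype(np.float32)]
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = representative_dataset
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8   # as with --inference_type int8
+converter.inference_output_type = tf.int8
+
+tflite_model = converter.convert()
+with open('dnn_quantized.tflite', 'wb') as f:
+    f.write(tflite_model)
+```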
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+            if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    if inference_type == 'int8':
+        converter.inference_input_type = tf.int8
+        converter.inference_output_type = tf.int8
+        supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    elif inference_type == 'int16':
+        converter.inference_input_type = tf.int16
+        converter.inference_output_type = tf.int16
+        supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+    else:
+        # Default ('fp32'): keep float inputs/outputs but fully quantize the internals.
+        supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+
+ # Int8 post training quantization needs representative dataset.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+        help='If quantize is true, whether the model input and output is fp32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+        Calculated MFCC features.
+ """
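+    # For example (not executed here): with 16 kHz audio, a 30 ms window is 480
+    # samples and a 10 ms stride is 160 samples, so a 1 second clip yields
+    # 1 + (16000 - 480) // 160 = 98 frames of num_mfcc coefficients each.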
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
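+    # For example, which_set('bobby_nohash_0.wav', 10, 10) and
+    # which_set('bobby_nohash_1.wav', 10, 10) always land in the same partition,
+    # since everything after '_nohash_' is ignored when hashing the name.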
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
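+    # e.g. prepare_words_list(['yes', 'no']) -> ['_silence_', '_unknown_', 'yes', 'no']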
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+            raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+            raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
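+        # e.g. a shift of +100 samples pads 100 zeros at the front of the clip
+        # (delaying the audio), while a shift of -100 pads 100 zeros at the end
+        # and starts the slice 100 samples in (advancing the audio); either way
+        # the result below is trimmed back to exactly `desired_samples` samples.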
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+        If the file does not already exist, download it and then untar it into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+        with tarfile.open(filepath, 'r:gz') as tar:
+            tar.extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+            silence_percentage: Percent of words that should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+            None if the '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+        # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+    """Calls the forward pass of a TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
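+    # Worked example (illustrative values only): with input_scale = 0.1 and
+    # input_zero_point = -128, a float input of 1.0 becomes
+    # round(1.0 / 0.1 + (-128)) = -118 before being cast to int8 below.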
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py
new file mode 100644
index 0000000..da2c57c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from ds_cnn_l_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..73d594b
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DS_CNN_Large - Hero\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DS_CNN_Large \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Hero |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+    "This how-to guide presents the key steps needed to reproduce everything in this package. The contents are organised as below, with internal navigation links so you can jump easily between sections.\n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+    "* [6.0 Single inference of the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+    "To recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.\n",
+ "\n",
+    "Once these requirements are satisfied you can execute the recreation script contained within this folder by running:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 11:38:02.599656: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 11:38:53.030038: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 11:38:53.069964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:38:53.070029: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:38:53.094139: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 11:38:53.094219: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 11:38:53.096985: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 11:38:53.097285: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 11:38:53.097852: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 11:38:53.098590: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 11:38:53.098752: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 11:38:53.099168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:38:53.099481: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 11:38:53.100222: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:38:53.100624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:38:53.100693: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:38:53.524442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:38:53.524481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:38:53.524492: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:38:53.524999: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 11:38:56.213089: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 11:38:58.326629: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 11:38:58.326721: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 11:38:58.327408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:38:58.327678: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:38:58.327711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:38:58.327721: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:38:58.327731: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:38:58.328025: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:38:58.347388: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 11:38:58.352977: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 11:38:58.537693: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 11:38:58.537738: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 11:38:58.545075: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 11:38:58.548334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:38:58.548626: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:38:58.548661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:38:58.548672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:38:58.548679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:38:58.548981: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to ds_cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 11:38:58.616947: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 339 1 5 2 3 6 5 3 2 1 4]\n",
+ " [ 0 2 382 0 0 2 8 0 0 0 0 3]\n",
+ " [ 0 5 2 382 1 7 1 0 0 0 0 8]\n",
+ " [ 0 2 0 0 323 1 3 0 1 14 5 1]\n",
+ " [ 0 2 0 6 1 361 1 1 0 0 3 2]\n",
+ " [ 0 0 5 1 0 0 344 2 0 0 0 0]\n",
+ " [ 0 3 0 1 0 0 0 358 0 0 0 1]\n",
+ " [ 1 3 0 2 4 1 0 0 344 7 0 1]\n",
+ " [ 0 2 1 0 18 0 1 0 4 342 3 2]\n",
+ " [ 0 1 0 0 8 0 0 1 0 4 335 1]\n",
+ " [ 0 4 0 9 1 5 0 0 1 2 2 348]]\n",
+ "Validation accuracy = 95.14%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 368 2 3 1 4 6 4 6 3 1 10]\n",
+ " [ 0 1 405 2 0 0 10 0 0 1 0 0]\n",
+ " [ 0 0 1 389 0 7 2 0 0 1 0 5]\n",
+ " [ 0 6 0 0 399 2 0 0 2 10 5 1]\n",
+ " [ 0 5 2 8 0 380 3 0 0 0 1 7]\n",
+ " [ 0 3 5 2 0 1 400 1 0 0 0 0]\n",
+ " [ 0 6 1 1 0 0 4 383 0 0 1 0]\n",
+ " [ 0 7 0 0 3 8 0 0 369 8 0 1]\n",
+ " [ 0 2 0 2 13 0 0 0 5 374 0 6]\n",
+ " [ 0 0 0 1 7 3 0 0 1 0 398 1]\n",
+ " [ 0 3 1 18 3 2 0 0 0 1 0 374]]\n",
+ "Test accuracy = 95.03%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 11:39:46.821173: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 11:40:36.690810: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 11:40:36.728954: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:40:36.728995: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:40:36.749408: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 11:40:36.749475: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 11:40:36.752323: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 11:40:36.752624: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 11:40:36.753198: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 11:40:36.753937: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 11:40:36.754090: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 11:40:36.754586: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:40:36.754864: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 11:40:36.755740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:40:36.756134: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:40:36.756197: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:40:37.210806: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:40:37.210845: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:40:37.210854: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:40:37.211393: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 11:40:39.812506: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 11:40:42.235293: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 11:40:42.235385: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 11:40:42.236028: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:40:42.236295: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:40:42.236328: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:40:42.236339: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:40:42.236348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:40:42.236662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:40:42.255416: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 11:40:42.259691: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 11:40:42.434390: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 11:40:42.434429: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 11:40:42.441258: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 11:40:42.444349: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:40:42.444613: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:40:42.444644: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:40:42.444655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:40:42.444662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:40:42.444950: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:40:42.484939: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to ds_cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 333 2 4 2 3 6 5 3 1 3 9]\n",
+ " [ 0 2 378 1 0 2 8 1 0 0 2 3]\n",
+ " [ 0 5 1 375 2 5 2 0 2 0 1 13]\n",
+ " [ 0 5 0 0 321 2 3 0 1 11 6 1]\n",
+ " [ 0 2 0 7 1 354 1 1 1 2 6 2]\n",
+ " [ 0 1 8 1 2 0 338 2 0 0 0 0]\n",
+ " [ 0 2 0 1 1 0 0 355 0 1 1 2]\n",
+ " [ 1 4 0 1 3 1 1 0 345 6 1 0]\n",
+ " [ 0 1 0 1 27 0 2 1 5 330 4 2]\n",
+ " [ 0 2 1 0 9 0 0 1 0 3 333 1]\n",
+ " [ 0 4 0 12 3 5 1 0 1 0 6 340]]\n",
+ "Validation accuracy = 93.88%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 363 2 5 3 5 7 3 8 0 3 9]\n",
+ " [ 0 2 399 3 0 0 13 0 0 1 1 0]\n",
+ " [ 0 1 1 384 0 7 4 0 0 0 2 6]\n",
+ " [ 0 9 0 0 398 2 1 0 1 7 6 1]\n",
+ " [ 0 5 3 12 1 372 5 0 1 0 1 6]\n",
+ " [ 0 4 5 2 1 0 395 1 0 0 4 0]\n",
+ " [ 0 8 0 4 3 2 7 370 0 0 2 0]\n",
+ " [ 0 9 0 1 6 8 0 2 361 7 2 0]\n",
+ " [ 0 2 0 2 16 0 1 0 5 367 2 7]\n",
+ " [ 0 0 0 0 11 3 0 3 1 2 389 2]\n",
+ " [ 0 6 1 19 4 5 3 0 0 1 2 361]]\n",
+ "Test accuracy = 93.39%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+    "Training is then performed and should produce a model that reaches the accuracy stated in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+    "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf), provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper; other variants are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+    "The command line argument *--model_size_info* passes the network layer dimensions (such as the\n",
+    "number of layers and the convolution filter size/stride) as a list to models.py, which builds the\n",
+    "TensorFlow graph based on the provided model architecture and layer dimensions. For more info on\n",
+    "*model_size_info* for each network architecture see [models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+    "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. We use 32 weight clusters and the kmeans++ cluster initialization method as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
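+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a rough illustration only (not the exact code in optimisations.py), the clustering step can be expressed with TensorFlow's weight clustering API along these lines, where `model` is a placeholder for the trained Keras model loaded from a checkpoint:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow_model_optimization as tfmot\n",
+    "\n",
+    "clustering_params = {\n",
+    "    'number_of_clusters': 32,\n",
+    "    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS\n",
+    "}\n",
+    "# Wrap the trained model so its weights are clustered, then fine-tune it.\n",
+    "clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)\n",
+    "# After fine-tuning, remove the clustering wrappers before export.\n",
+    "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+    "```"
+   ]
+  },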
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+    "make quantization of the trained models straightforward.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+    "The ```inference_type``` parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
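+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For orientation, a typical post-training quantization flow with the TensorFlow Lite converter looks roughly like the sketch below; the actual logic lives in convert_to_tflite.py and may differ. Here `model` and `representative_dataset` are placeholders for the trained Keras model and a generator yielding calibration MFCC inputs:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "converter.representative_dataset = representative_dataset  # calibration samples\n",
+    "# Optionally force fully integer inputs/outputs (cf. --inference_type int8):\n",
+    "converter.inference_input_type = tf.int8\n",
+    "converter.inference_output_type = tf.int8\n",
+    "tflite_model = converter.convert()\n",
+    "```"
+   ]
+  },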
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+    "You can run TFLite inference on the fp32 and int8 TFLite model files with the following command: \n",
+ "\n",
+ "```python ds_cnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md
new file mode 100644
index 0000000..be17ae3
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32
+
+## Description
+This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained (fine-tuned) floating point fp32 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | d9af9829a2363c21fd6158c7bc425d0b635eb55c |
+| Size (Bytes) | 1652648 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 94.76% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_check_mark: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input | fp32 | [1, 490] | The input is a vector of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml
new file mode 100644
index 0000000..77d4f8c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml
@@ -0,0 +1,67 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 94.76%
+ benchmark_name: Google Speech Commands test set
+description: This is a clustered (32 clusters, kmeans++ centroid initialization)
+ and retrained (fine-tuned) fp32 version of the DS-CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 1652648
+ filename: ds_cnn_l_clustered_fp32.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: d9af9829a2363c21fd6158c7bc425d0b635eb55c
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: true
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ds_cnn_clustered_fp32.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/ds_cnn_l_clustered_fp32.tflite
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ds_cnn_clustered_fp32.tflite
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/ds_cnn_l_clustered_fp32.tflite
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md
new file mode 100644
index 0000000..976c8c6
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8
+
+## Description
+This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained (fine-tuned) fully quantized int8 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|------------------------------------------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 2ee38794ed171c75d3313460a1633c5d6a79f530 |
+| Size (Bytes) | 503816 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 93.87% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_check_mark: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input | int8 | [1, 490] | The input is a vector of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml
new file mode 100644
index 0000000..a3adef5
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml
@@ -0,0 +1,67 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 93.87%
+ benchmark_name: Google Speech Commands test set
+description: This is a clustered (32 clusters, kmeans++ centroid initialization)
+ and retrained (fine-tuned) fully quantized int8 version of the DS-CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 503816
+ filename: ds_cnn_l_clustered_int8.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 2ee38794ed171c75d3313460a1633c5d6a79f530
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: true
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ds_cnn_clustered_int8.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/ds_cnn_l_clustered_int8.tflite
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ds_cnn_clustered_int8.tflite
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/ds_cnn_l_clustered_int8.tflite
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..7647971
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | fea0e0dc13fc4207dd44904fe701f34254dd4767 |
+| Size (Bytes) | 1652648 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 95.03% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a vector of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
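+
+## Example Usage
+
+A minimal sketch (not part of the package) of how the bundled example input and reference output can be used as a quick regression check for this model; paths are relative to the repository root and the standard TensorFlow Lite interpreter API is assumed.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+base = 'models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32'
+
+interpreter = tf.lite.Interpreter(model_path=base + '/ds_cnn_l.tflite')
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Load the packaged example MFCC features and the reference output.
+x = np.load(base + '/testing_input/input/0.npy')
+y_ref = np.load(base + '/testing_output/Identity/0.npy')
+
+interpreter.set_tensor(inp['index'], x.astype(np.float32))
+interpreter.invoke()
+y = interpreter.get_tensor(out['index'])
+
+print('Max abs difference vs reference:', np.abs(y - y_ref).max())
+```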
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..288d185
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 95.03%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DS-CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 1652648
+ filename: ds_cnn_l.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: fea0e0dc13fc4207dd44904fe701f34254dd4767
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite
new file mode 100644
index 0000000..6619422
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:703bedd0f14360a47ac870a51b13dfde965e4be4d901ee8c6b87bd2f3360671b
+size 1652648
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..8886270
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:add2d479791b5e4aa5e4bfd8f16cf47f965783aff20845a8283fa7e571cabd50
+size 2088
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..5b8a6d6
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ffd3d5e8b2601d820fd4b4c786d5f475075848f6f9636a5d62a7c38f30d2cc0
+size 176
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..7f813ed
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 |
+| Size (Bytes) | 503816 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 94.52% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a vector of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..6a2b864
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 94.52%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DS-CNN Large model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 503816
+ filename: ds_cnn_l_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+ - description: The input is a processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+ - description: The probability on 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ds_cnn_l_quantized.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_l_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ds_cnn_l_quantized.tflite
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_l_quantized.tflite
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb
new file mode 100644
index 0000000..454265f
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb5e42915f74efe437002d09ef323928da8efdc68b403118711d05871534690e
+size 78436
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb
new file mode 100644
index 0000000..95b9f8f
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac426a86f4d862a0055c945b92ecb0e8f3de3ea90542b2731764b67c2e9ae3f3
+size 859950
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..77a395d
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efd31c705f2114c88f89660862742beb82a0bea80efd245969076e5339bccdf4
+size 1713786
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index
new file mode 100644
index 0000000..7493cc8
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30be633de16e745ef0a11a3842ad8dbc70d8ead948acf049a613aff0c64cd3d
+size 4397
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.index
similarity index 100%
rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index
rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.index
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
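+    # Example: the DS-CNN Large settings used in recreate_model.sh (sample_rate=16000,
+    # clip_duration_ms=1000, window_size_ms=40, window_stride_ms=20,
+    # dct_coefficient_count=10) give spectrogram_length = 49 and fingerprint_size = 490,
+    # matching the (1, 490) input shape of the packaged TFLite models.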
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+      model_size_info: Array with specific information for the chosen architecture
+        (e.g. convolutional parameters, number of layers).
+      is_training: Whether the model is being created for training; only used by the
+        Basic LSTM model to decide whether to unroll the LSTM layer.
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+        raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+                        f'should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+ is_training: Determining whether the use of the model is for training or for something else.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+    # LSTM layer: keep it rolled during training and unroll it for inference/export.
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden-layers and
+ each element in the array represent the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
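+    # Example: model_size_info = [128, 128, 128] (the default, also used in the
+    # tutorial notebook) builds three fully-connected hidden layers of 128 units each.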
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+ first_filter_stride_x = model_size_info[4] # Frequency_axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
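+    # Example: recreate_model.sh builds DS-CNN Large with model_size_info =
+    # [6, 276,10,4,2,1, 276,3,3,2,2, 276,3,3,1,1, 276,3,3,1,1, 276,3,3,1,1, 276,3,3,1,1]:
+    # 6 layers of 276 features each; the first is a regular 10x4 convolution with
+    # stride 2x1, the second a 3x3 depthwise separable block with stride 2x2, and the
+    # remaining four are 3x3 depthwise separable blocks with stride 1x1.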
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+    # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
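+    """Prints the number of unique kernel weight values (clusters) for each layer."""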
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
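+    """Applies weight clustering (32 clusters, kmeans++ centroids) to a trained model,
+    fine-tunes it, then evaluates on the test set and saves the clustered weights.
+    """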
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info,
+                                is_training=True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..fabe86c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/ds_cnn_0.95_ckpt
+train=false
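+
+# Usage: ./recreate_model.sh [--ckpt <checkpoint path>] [--train]
+# By default the packaged checkpoint is converted to TFLite fp32 and int8;
+# pass --train to retrain DS-CNN Large from scratch before the conversions.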
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DS-CNN Large training
+if [ "$train" = true ]
+then
+python train.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_L/retrain_logs --train_dir work/DS_CNN/DS_CNN_L/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
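+    """Trains the selected keyword spotting model, checkpointing the best validation
+    accuracy, then evaluates on the test set and saves SavedModel and Keras H5 copies.
+    """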
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md
deleted file mode 100644
index 0643dd8..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# DS-CNN Clustered FP32
-
-## Description
-This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained (fine-tuned) FP32 version of the DS-CNN Large model developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted to Keras and optimized using the Clustering API in TensorFlow Model Optimization Toolkit.
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|----------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | d9af9829a2363c21fd6158c7bc425d0b635eb55c |
-| Size (Bytes) | 1652648 |
-| Provenance | The original model (before clustering and quantization) is a pretrained checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_multiplication_x: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_multiplication_x: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Top 1 Accuracy | 0.950 |
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Number of Clusters | 32 |
-| Cluster Initialization | K-Means |
-
-## Network Inputs
-
-
- Input Node Name |
- Shape |
- Description |
-
-
- input |
- (1, 490) |
- The input is a processed MFCCs of shape (1,490) |
-
-
-
-## Network Outputs
-
-
- Output Node Name |
- Shape |
- Description |
-
-
- Identity |
- (1, 12) |
- The probability on 12 keywords. |
-
-
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint
deleted file mode 100644
index be5b265..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint
+++ /dev/null
@@ -1,2 +0,0 @@
-model_checkpoint_path: "ds_cnn_clustered_ckpt"
-all_model_checkpoint_paths: "ds_cnn_clustered_ckpt"
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001
deleted file mode 100644
index fbbad53..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:77f79b7be1dec13fa39088ca249cc6ea1ab2a0e0bab595034a81a7915d0584f1
-size 1699733
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index
deleted file mode 100644
index f1630cc..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:62b15a99efc82778286c3de5248bbf4d246a751a95007d27c5e778527929b015
-size 4396
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml
deleted file mode 100644
index f9c2303..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-benchmark:
- SpeechCommands:
- top_1_accuracy: 0.950
-description: 'This is a clustered (32 clusters, kmeans++ centroid initialization)
- and retrained (fine-tuned) FP32 version of the DS-CNN Large model developed by Arm
- from the Hello Edge paper. Code for the original DS-CNN implementation can be found
- here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted
- to Keras and optimized using the Clustering API in TensorFlow Model Optimization
- Toolkit.'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 1652648
- filename: ds_cnn_clustered_fp32.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: d9af9829a2363c21fd6158c7bc425d0b635eb55c
- provenance: The original model (before clustering and quantization) is a pretrained
- checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1,490)
- example_input:
- path: models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input
- name: input
- shape:
- - 1
- - 490
- type: float32
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - FULLY_CONNECTED
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md
deleted file mode 100644
index 3e859ed..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# DS-CNN Clustered INT8
-
-## Description
-This is a clustered (32 clusters, kmeans++ centroid initialization), retrained (fine-tuned) and fully quantized version (INT8) of the DS-CNN Large model developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted to Keras, optimized using the Clustering API in TensorFlow Model Optimization Toolkit, and quantized using post-training quantization in the TF Lite Converter.
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|----------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 2ee38794ed171c75d3313460a1633c5d6a79f530 |
-| Size (Bytes) | 503816 |
-| Provenance | The original model (before clustering) is a pretrained checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_multiplication_x: |
-| Cortex-M |:heavy_check_mark: |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Top 1 Accuracy | 0.940 |
-
-## Optimizations
-| Optimization | Value |
-|--------------|---------|
-| Quantization | INT8 |
-| Number of Clusters | 32 |
-| Cluster Initialization | K-Means |
-
-## Network Inputs
-
-
- Input Node Name |
- Shape |
- Description |
-
-
- input |
- (1, 490) |
- The input is a processed MFCCs of shape (1,490) |
-
-
-
-## Network Outputs
-
-
- Output Node Name |
- Shape |
- Description |
-
-
- Identity |
- (1, 12) |
- The probability on 12 keywords. |
-
-
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint
deleted file mode 100644
index be5b265..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint
+++ /dev/null
@@ -1,2 +0,0 @@
-model_checkpoint_path: "ds_cnn_clustered_ckpt"
-all_model_checkpoint_paths: "ds_cnn_clustered_ckpt"
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001
deleted file mode 100644
index fbbad53..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:77f79b7be1dec13fa39088ca249cc6ea1ab2a0e0bab595034a81a7915d0584f1
-size 1699733
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index
deleted file mode 100644
index f1630cc..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:62b15a99efc82778286c3de5248bbf4d246a751a95007d27c5e778527929b015
-size 4396
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml
deleted file mode 100644
index 3d65144..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-benchmark:
- SpeechCommands:
- top_1_accuracy: 0.940
-description: 'This is a clustered (32 clusters, kmeans++ centroid initialization),
- retrained (fine-tuned) and fully quantized version (INT8) of the DS-CNN Large model
- developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation
- can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original
- model was converted to Keras, optimized using the Clustering API in TensorFlow Model
- Optimization Toolkit, and quantized using post-training quantization in the TF Lite
- Converter.'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 503816
- filename: ds_cnn_clustered_int8.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 2ee38794ed171c75d3313460a1633c5d6a79f530
- provenance: The original model (before clustering) is a pretrained checkpoint based
- on https://github.com/ARM-software/ML-KWS-for-MCU
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1,490)
- example_input:
- path: models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- type: int8
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - FULLY_CONNECTED
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md
deleted file mode 100644
index e132990..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DS-CNN Large INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DS-CNN Large model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 |
-| Size (Bytes) | 503816 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.946 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: HERO |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml
deleted file mode 100644
index 54df622..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 94.58%
-description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN
- Large model developed by Arm, with training checkpoints, from the Hello Edge paper.
- Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 503816
- filename: ds_cnn_l_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: hero#CORTEX-M
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md
new file mode 100644
index 0000000..47e2846
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DS-CNN Medium model package
+
+This folder contains code that will allow you to recreate the DS-CNN Medium keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DS_CNN_Medium |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Hero |
+
+## Model Recreation
+
+To recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained within this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and evaluate them on the test set. Both an fp32 version and a quantized version will be produced;
+the quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model matching the accuracy stated in this repository.
+Note that exporting to TFLite will still use the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* passes the neural network layer
+dimensions (such as the number of layers and the convolution filter size/stride) as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more information on *model_size_info* for each network architecture, see
+[models.py](models.py).
+
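+To make the shape of *model_size_info* concrete, below is a minimal, hypothetical sketch of how a flat list such as
+```128 128 128``` could be turned into fully-connected layers; the actual construction for every supported architecture
+lives in [models.py](models.py):
+
+```python
+import tensorflow as tf
+
+def build_dnn(fingerprint_size, label_count, model_size_info):
+    """Hypothetical builder: each entry in model_size_info is the width of one dense layer."""
+    inputs = tf.keras.Input(shape=(fingerprint_size,))
+    x = inputs
+    for units in model_size_info:  # e.g. [128, 128, 128]
+        x = tf.keras.layers.Dense(units, activation='relu')(x)
+    outputs = tf.keras.layers.Dense(label_count)(x)
+    return tf.keras.Model(inputs, outputs)
+```
+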
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
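+
+For reference, below is a minimal sketch of the kind of clustering `optimisations.py` applies, assuming a trained Keras model held in a variable named `model` (an illustrative name); the script itself also handles checkpoint loading, fine-tuning and saving:
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+}
+
+# Wrap the trained model so that its weights are constrained to 32 clusters.
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+
+# Re-compile and fine-tune briefly so accuracy recovers after clustering.
+clustered_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
+                        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+                        metrics=['accuracy'])
+# clustered_model.fit(train_data, epochs=...)
+
+# Remove the clustering wrappers before export or checkpointing.
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```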
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models super simple.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The *inference_type* parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
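+
+Under the hood, `convert_to_tflite.py` uses the standard TFLite converter flow for post-training quantization. A minimal sketch, assuming a trained Keras model in `model` and a generator `representative_dataset` that yields calibration samples (both names are illustrative):
+
+```python
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+# A representative dataset is needed to calibrate activation ranges.
+converter.representative_dataset = representative_dataset
+# Restrict to int8 kernels and use int8 inputs/outputs for a fully quantized model.
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8
+converter.inference_output_type = tf.int8
+
+tflite_model = converter.convert()
+with open('dnn_quantized.tflite', 'wb') as f:
+    f.write(tflite_model)
+```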
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+ if i >= NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+ # Default to the standard built-in ops so conversion with fp32 input/output still works.
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS
+ if inference_type == 'int8':
+ converter.inference_input_type = tf.int8
+ converter.inference_output_type = tf.int8
+ supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+ elif inference_type == 'int16':
+ converter.inference_input_type = tf.int16
+ converter.inference_output_type = tf.int16
+ supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+
+ # Post training quantization needs a representative dataset to calibrate activations.
+ converter.representative_dataset = rep_dataset
+ converter.target_spec.supported_ops = [supported_ops]
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+ help='If quantize is true, whether the model input and output are fp32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+ Calculated MFCC features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: How many of the samples have background noise mixed in.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+ raise ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+ raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: How many clips will have background noise, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words wanted to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+ None if the '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+ # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+ return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+      help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+    """Runs a forward pass of the TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
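+    # Dequantize the output back to float using the output scale and zero point (a no-op for float models).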
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+    with open(filename, "r") as f:
+        return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py
new file mode 100644
index 0000000..f1ea40a
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from ds_cnn_m_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+          f' (N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+          f' (N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+          f' (N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+          f' (N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
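+        # expect_partial() silences warnings about checkpoint values (e.g. optimizer state) that are not needed for inference.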
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..fea007f
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DS_CNN_Medium - Hero\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DS_CNN_Medium \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Hero |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+    "This how-to guide presents the key steps needed to reproduce everything in this package. The contents are organised as below, with internal navigation links so you can jump straight to each section.\n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+    "* [6.0 Single inference of the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+    "In order to recreate the model you will first need to be using ```Python3.7``` and to install the requirements in ```requirements.txt```.\n",
+ "\n",
+    "Once you have these requirements satisfied you can execute the recreation script contained within this folder by running:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 11:54:08.485801: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 11:54:58.475678: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 11:54:58.516721: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:54:58.516765: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:54:58.537249: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 11:54:58.537321: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 11:54:58.540057: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 11:54:58.540315: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 11:54:58.540872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 11:54:58.541591: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 11:54:58.541745: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 11:54:58.542218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:54:58.542511: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 11:54:58.543331: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:54:58.543822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:54:58.543872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:54:58.966709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:54:58.966747: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:54:58.966761: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:54:58.967266: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 11:55:01.322474: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 11:55:03.039244: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 11:55:03.039493: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 11:55:03.039987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:55:03.040276: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:55:03.040309: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:55:03.040317: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:55:03.040325: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:55:03.040640: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:55:03.059483: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 11:55:03.063108: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.01ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 11:55:03.313219: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 11:55:03.313256: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 11:55:03.318616: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 11:55:03.321473: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:55:03.321732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:55:03.321763: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:55:03.321773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:55:03.321780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:55:03.322065: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to ds_cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 11:55:03.376097: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 332 3 5 1 5 6 3 8 3 2 3]\n",
+ " [ 0 4 386 1 0 0 4 0 0 0 0 2]\n",
+ " [ 0 5 2 378 2 3 3 0 0 1 1 11]\n",
+ " [ 0 1 2 0 324 1 0 0 1 16 4 1]\n",
+ " [ 0 3 0 8 1 360 0 0 1 1 1 2]\n",
+ " [ 1 0 8 1 1 0 338 3 0 0 0 0]\n",
+ " [ 0 2 1 1 0 0 1 356 0 1 1 0]\n",
+ " [ 1 5 0 2 4 0 0 0 341 10 0 0]\n",
+ " [ 0 2 0 0 16 0 3 0 4 345 2 1]\n",
+ " [ 1 1 0 0 12 1 0 1 0 1 332 1]\n",
+ " [ 0 4 0 13 2 4 1 0 1 1 1 345]]\n",
+ "Validation accuracy = 94.67%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 362 2 8 5 2 8 3 5 1 2 10]\n",
+ " [ 0 7 402 2 0 0 7 0 0 1 0 0]\n",
+ " [ 0 4 1 389 0 4 1 0 0 0 0 6]\n",
+ " [ 0 6 0 0 397 1 0 0 4 12 5 0]\n",
+ " [ 0 8 1 14 0 374 3 1 1 0 1 3]\n",
+ " [ 0 8 5 1 0 0 396 2 0 0 0 0]\n",
+ " [ 0 6 0 0 0 1 4 383 0 1 1 0]\n",
+ " [ 0 4 0 0 7 3 1 0 368 13 0 0]\n",
+ " [ 0 5 0 2 11 0 1 0 5 375 0 3]\n",
+ " [ 0 3 0 0 8 2 1 1 0 0 394 2]\n",
+ " [ 0 5 1 27 3 1 1 1 0 1 0 362]]\n",
+ "Test accuracy = 94.27%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 11:55:32.290813: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 11:56:25.228757: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 11:56:25.264869: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:56:25.264908: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:56:25.285323: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 11:56:25.285388: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 11:56:25.288128: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 11:56:25.288385: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 11:56:25.288944: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 11:56:25.289667: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 11:56:25.289820: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 11:56:25.292002: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:56:25.292281: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 11:56:25.293162: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:56:25.293718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:56:25.293799: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 11:56:25.736053: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:56:25.736092: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:56:25.736100: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:56:25.736608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 11:56:28.038374: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 11:56:29.838652: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 11:56:29.838886: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 11:56:29.839342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:56:29.839606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:56:29.839637: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:56:29.839648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:56:29.839655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:56:29.839941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:56:29.859427: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 11:56:29.863763: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.013ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.002ms.\n",
+ "\n",
+ "2023-01-31 11:56:30.003088: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 11:56:30.003122: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 11:56:30.008047: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 11:56:30.010836: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 11:56:30.011085: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 11:56:30.011115: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 11:56:30.011125: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 11:56:30.011131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 11:56:30.011421: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 11:56:30.051239: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to ds_cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 328 2 5 1 8 5 3 8 4 2 5]\n",
+ " [ 0 6 375 1 0 1 10 0 0 0 1 3]\n",
+ " [ 0 9 2 368 1 3 6 0 1 0 4 12]\n",
+ " [ 0 3 1 0 319 1 1 0 2 13 9 1]\n",
+ " [ 0 3 2 9 0 350 1 0 3 1 2 6]\n",
+ " [ 1 3 8 1 1 0 334 3 0 0 0 1]\n",
+ " [ 0 4 1 0 1 0 1 351 0 1 2 2]\n",
+ " [ 1 6 0 1 4 0 0 0 343 7 0 1]\n",
+ " [ 0 5 0 0 21 0 3 1 4 333 3 3]\n",
+ " [ 1 2 0 0 11 0 0 1 1 2 331 1]\n",
+ " [ 0 7 0 15 2 4 1 0 0 1 3 339]]\n",
+ "Validation accuracy = 93.18%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 363 1 7 2 1 11 3 4 1 3 12]\n",
+ " [ 0 7 399 2 0 0 8 0 1 0 0 2]\n",
+ " [ 0 5 1 380 0 4 4 0 0 0 2 9]\n",
+ " [ 0 8 0 0 390 1 1 2 4 10 8 1]\n",
+ " [ 0 8 1 14 2 370 2 1 0 0 4 4]\n",
+ " [ 0 9 4 1 1 0 395 2 0 0 0 0]\n",
+ " [ 0 8 2 0 2 1 8 372 0 1 2 0]\n",
+ " [ 0 9 0 0 9 3 1 0 358 12 1 3]\n",
+ " [ 0 7 0 2 15 0 1 0 4 362 4 7]\n",
+ " [ 0 3 0 0 7 4 1 2 0 1 391 2]\n",
+ " [ 0 9 2 26 3 4 0 0 2 1 4 351]]\n",
+ "Test accuracy = 92.82%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+    "Training is then performed and should produce a model that reaches the accuracy stated in this repository. Note that the TFLite export will still use the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+    "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf), provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper; other variants are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+    "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+    "dimensions (such as the number of layers and the convolution filter size/stride) as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+    "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. The clustering hyperparameters are 32 weight clusters and the kmeans++ cluster initialization method.\n",
+ "\n",
+    "To optimize your trained model (e.g. a DNN), you need a trained model checkpoint on which to run clustering and fine-tuning.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+    "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.\n",
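+    "\n",
+    "A minimal sketch of this clustering step is shown below, assuming a trained Keras `model` restored from a checkpoint and the TensorFlow Model Optimization Toolkit; `optimisations.py` remains the authoritative implementation:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow_model_optimization as tfmot\n",
+    "\n",
+    "cluster_weights = tfmot.clustering.keras.cluster_weights\n",
+    "CentroidInitialization = tfmot.clustering.keras.CentroidInitialization\n",
+    "\n",
+    "clustering_params = {\n",
+    "    'number_of_clusters': 32,\n",
+    "    'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS,\n",
+    "}\n",
+    "\n",
+    "# Wrap the trained model with clustering, then fine-tune for a reduced number of steps.\n",
+    "clustered_model = cluster_weights(model, **clustering_params)\n",
+    "clustered_model.compile(optimizer='adam',\n",
+    "                        loss='sparse_categorical_crossentropy',\n",
+    "                        metrics=['accuracy'])\n",
+    "\n",
+    "# Strip the clustering wrappers before checkpointing or exporting the final model.\n",
+    "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+    "```"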
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+    "quantize the trained models with minimal effort.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+    "The ```inference_type``` parameter is *optional* and should be used when a fully quantized model with int8 or int16 inputs and outputs is needed. It defaults to fp32.\n",
+ "\n",
+    "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*.\n",
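+    "\n",
+    "For reference, a minimal sketch of full integer post-training quantization with ```tf.lite.TFLiteConverter``` is shown below; ```convert_to_tflite.py``` is the authoritative script, and ```model``` and ```representative_dataset``` (a generator yielding example MFCC inputs) are assumptions:\n",
+    "\n",
+    "```python\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# representative_dataset is assumed to yield a few hundred example MFCC inputs for calibration.\n",
+    "converter.representative_dataset = representative_dataset\n",
+    "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+    "converter.inference_input_type = tf.int8\n",
+    "converter.inference_output_type = tf.int8\n",
+    "\n",
+    "tflite_model = converter.convert()\n",
+    "with open('dnn_quantized.tflite', 'wb') as f:\n",
+    "    f.write(tflite_model)\n",
+    "```"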
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+    "You can run single-sample TFLite inference on the fp32 and int8 model files by using the following command: \n",
+ "\n",
+ "```python ds_cnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..ae2c70e
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DS-CNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 620951417ca52a1640bb25490ca7b34507fe8881 |
+| Size (Bytes) | 548468 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 94.27% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..2277065
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 94.27%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DS-CNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 548468
+ filename: ds_cnn_m.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 620951417ca52a1640bb25490ca7b34507fe8881
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite
new file mode 100644
index 0000000..b4b2f28
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:405ba6ec5977ae6bd42ac153deb02f471bcd76e6c07b127352e4a0f3ca5be054
+size 548468
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..701fcd4
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcdf0702505989d7a0fdffca09308abde32082a1f56bad845c05fbca24e87aa4
+size 2088
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..f6082ba
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a107cccce62cb03a3aadc59387f87ecb46a6e4bf81ed5f67d15750fa8b78fec
+size 176
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..331b883
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,63 @@
+# keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DS-CNN Medium model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | 740d32adde16948b2ab45e1e8c856de2925a05eb |
+| Size (Bytes) | 186288 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 93.93% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a set of processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
+
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..7cc5a2a
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 93.93%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DS-CNN Medium model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 186288
+ filename: ds_cnn_m_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 740d32adde16948b2ab45e1e8c856de2925a05eb
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is a set of processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Deployable
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ds_cnn_m_quantized.tflite b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_m_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ds_cnn_m_quantized.tflite
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_m_quantized.tflite
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb
new file mode 100644
index 0000000..d1cf98b
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e2c679859ef8fe55a5240076d46d21fb6058d6f5eb6789e8f66484c0eb5606c
+size 65455
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb
new file mode 100644
index 0000000..edf9f9d
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3441fda9da39b45faa7e26c777cb8608318cb6140df5aee5470f2a94c04b5a7
+size 711776
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..fa0e037
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89a822e0c17c8dc7500805a9833fd2558ffe89da671932747c508402e60c7405
+size 583382
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index
new file mode 100644
index 0000000..24cf127
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f94d6215cd19d5651d333504aad08c2d1450afae072b86e9d6c344b8e23fd26
+size 3642
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/checkpoint
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.index
similarity index 100%
rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index
rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.index
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
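+# Illustrative example (not executed here): with the flags used for the DS-CNN Medium
+# model in recreate_model.sh (sample_rate=16000, clip_duration_ms=1000, window_size_ms=40,
+# window_stride_ms=20, dct_coefficient_count=10) this computes:
+#   desired_samples = 16000, window_size_samples = 640, window_stride_samples = 320,
+#   spectrogram_length = 1 + (16000 - 640) // 320 = 49,
+#   fingerprint_size = 10 * 49 = 490,
+# which matches the (1, 490) MFCC input of the exported TFLite model.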
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+      model_size_info: Array with specific information for the chosen architecture
+        (e.g. convolutional parameters, number of layers).
+      is_training: Whether the model is being created for training (only used by some
+        architectures, e.g. 'basic_lstm').
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+        raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+                        f'should be one of "dnn", "cnn", "ds_cnn", "single_fc" or "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+      model_size_info: Array whose first element gives the number of units in the LSTM layer.
+      is_training: Whether the model is being created for training; when True the LSTM is
+        left rolled (unroll=False), otherwise it is unrolled for inference.
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+    # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+      model_size_info: Length of the array defines the number of hidden layers and
+        each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+    first_filter_stride_x = model_size_info[4] # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+    inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+    # Fully connected layer followed by batch norm and ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
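+    # Illustrative example: with the DS-CNN Medium flags from recreate_model.sh,
+    # model_size_info = [5, 172,10,4,2,1, 172,3,3,2,2, 172,3,3,1,1, 172,3,3,1,1, 172,3,3,1,1],
+    # i.e. 5 layers, each described by [features, kernel_t, kernel_f, stride_t, stride_f]:
+    # layer 0 is a standard conv (172 feats, 10x4 kernel, 2x1 stride) and layers 1-4 are
+    # depthwise separable convs (172 feats, 3x3 kernels, 2x2 then 1x1 strides).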
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+    inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+            x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+                                                strides=(conv_st[layer_no], conv_sf[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+    # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+    model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..278bddd
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/ds_cnn_0.95_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DS-CNN Medium training
+if [ "$train" = true ]
+then
+python train.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_M/retrain_logs --train_dir work/DS_CNN/DS_CNN_M/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
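+    # Example with the default flags (--how_many_training_steps '15000,3000' and
+    # --learning_rate '0.001,0.0001'): boundaries = [15000] and values = [0.001, 0.0001],
+    # i.e. train at 0.001 for the first 15000 steps and at 0.0001 for the remaining 3000.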
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md
deleted file mode 100644
index c675a6f..0000000
--- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DS-CNN Medium INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DS-CNN Medium model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | 740d32adde16948b2ab45e1e8c856de2925a05eb |
-| Size (Bytes) | 186288 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.941 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: HERO |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml
deleted file mode 100644
index c77867c..0000000
--- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 94.13%
-description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN
- Medium model developed by Arm, with training checkpoints, from the Hello Edge paper.
- Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 186288
- filename: ds_cnn_m_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: 740d32adde16948b2ab45e1e8c856de2925a05eb
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: hero#CORTEX-M
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md
new file mode 100644
index 0000000..077f31c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md
@@ -0,0 +1,115 @@
+# DS-CNN Small model package
+
+This folder contains code that will allow you to recreate the DS-CNN Small keyword spotting model from
+the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf).
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Model Package Overview
+| Model | DS_CNN_Small |
+|:---------------: |:------------------------------------------:|
+| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |
+| **Feature**: | Keyword spotting for Arm Cortex-M CPUs |
+| **Architectural Delta w.r.t. Vanilla**: | None |
+| **Domain**: | Keyword spotting |
+| **Package Quality**: | Hero |
+
+## Model Recreation
+
+To recreate the model you will first need to be using ```Python 3.7``` and to install the requirements in ```requirements.txt```.
+
+Once these requirements are satisfied, you can execute the recreation script contained in this folder by running:
+
+```bash
+bash ./recreate_model.sh
+```
+
+Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder
+to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced;
+the quantized version is fully quantized using post-training quantization.
+
+If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:
+
+```bash
+bash ./recreate_model.sh --train
+```
+
+Training is then performed and should produce a model with the accuracy stated in this repository.
+Note that exporting to TFLite will still be done with the pre-trained checkpoint files, so you will need to re-run the script
+and this time supply the path to the newly trained checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <path_to_checkpoint>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions (such as the number of layers and convolution filter size/stride) as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](model_core_utils/models.py).
+
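+For illustration, here is a minimal sketch (not part of the package; it assumes you run it from this
+folder so that the `model_core_utils` and `data_processing` modules are importable) of how the
+`--model_size_info 128 128 128` example above maps onto the model-building API:
+
+```python
+from model_core_utils import models
+from data_processing import data_preprocessing
+
+# Same defaults as train.py: 16 kHz audio, 1000 ms clips, 30 ms windows, 10 ms stride, 40 MFCCs.
+wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'.split(',')
+model_settings = models.prepare_model_settings(
+    len(data_preprocessing.prepare_words_list(wanted_words)),
+    sample_rate=16000, clip_duration_ms=1000,
+    window_size_ms=30.0, window_stride_ms=10.0, dct_coefficient_count=40)
+
+# model_size_info [128, 128, 128] -> a DNN with three 128-neuron hidden layers.
+model = models.create_model(model_settings, 'dnn', [128, 128, 128], is_training=True)
+model.summary()
+```
+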
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path_to_checkpoint>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.
+
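+As a rough sketch, the clustering step in `optimisations.py` boils down to the following use of the
+TensorFlow Model Optimization API (illustrative only; the stand-in `model` below replaces the trained
+KWS model that the real script loads from a checkpoint and then fine-tunes and evaluates):
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+# Stand-in for the trained KWS model (in optimisations.py this comes from a checkpoint).
+model = tf.keras.Sequential([tf.keras.layers.Dense(12, input_shape=(490,))])
+
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+}
+
+# Wrap the model so its kernels are constrained to 32 shared weight values.
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+
+# ...fine-tune clustered_model as usual, then strip the wrappers before export:
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```
+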
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path_to_checkpoint>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
+
+## Quantization and TFLite Conversion
+
+As part of this update we now use TensorFlow's
+[post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantizing the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path_to_checkpoint> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
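+If you want to sanity-check the generated file, a quick illustrative way (assuming TensorFlow is
+installed, as per requirements.txt) is to inspect its input and output types with the TFLite interpreter:
+
+```python
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path='dnn_quantized.tflite')
+interpreter.allocate_tensors()
+
+# For an int8 conversion both of these should report int8; for the default they stay float32.
+print(interpreter.get_input_details()[0]['dtype'])
+print(interpreter.get_output_details()[0]['dtype'])
+```
+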
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <path_to_checkpoint> --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for converting and quantizing a trained keyword spotting
+ model and saving to TFLite."""
+
+import argparse
+
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from evaluation import tflite_test
+
+NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization.
+
+
+def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path):
+ """Load our trained floating point model and convert it.
+
+ TFLite conversion or post training quantization is performed and the
+ resulting model is saved as a TFLite file.
+ We use samples from the validation set to do post training quantization.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ checkpoint: Path to training checkpoint to load.
+ quantize: Whether to quantize the model or convert to fp32 TFLite model.
+ inference_type: Input/output type of the quantized model.
+ tflite_path: Output TFLite file save path.
+ """
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(checkpoint).expect_partial()
+
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+
+ def _rep_dataset():
+ """Generator function to produce representative dataset."""
+ i = 0
+ for mfcc, label in val_data:
+ if i > NUM_REP_DATA_SAMPLES:
+ break
+ i += 1
+ yield [mfcc]
+
+ if quantize:
+ # Quantize model and save to disk.
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset)
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Quantized model saved to {tflite_path}.')
+ else:
+ converter = tf.lite.TFLiteConverter.from_keras_model(model)
+ tflite_model = converter.convert()
+ with open(tflite_path, 'wb') as f:
+ f.write(tflite_model)
+ print(f'Converted model saved to {tflite_path}.')
+
+
+def post_training_quantize(keras_model, inference_type, rep_dataset):
+ """Perform post training quantization and returns the TFLite model ready for saving.
+
+ See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for
+ more details.
+
+ Args:
+ keras_model: The trained tf Keras model used for post training quantization.
+ inference_type: Input/output type of the quantized model.
+ rep_dataset: Function to use as a representative dataset, must be callable.
+
+ Returns:
+ Quantized TFLite model ready for saving to disk.
+ """
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
+
+    if inference_type == 'int8':
+        converter.inference_input_type = tf.int8
+        converter.inference_output_type = tf.int8
+        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    elif inference_type == 'int16':
+        converter.inference_input_type = tf.int16
+        converter.inference_output_type = tf.int16
+        converter.target_spec.supported_ops = [
+            tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]
+
+    # Full integer post-training quantization needs a representative dataset.
+    converter.representative_dataset = rep_dataset
+
+ tflite_model = converter.convert()
+
+ return tflite_model
+
+
+def main():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.quantize:
+ tflite_path = f'{FLAGS.model_architecture}_quantized.tflite'
+ else:
+ tflite_path = f'{FLAGS.model_architecture}.tflite'
+
+ # Load floating point model from checkpoint and convert it.
+ convert(model_settings, audio_processor, FLAGS.checkpoint,
+ FLAGS.quantize, FLAGS.inference_type, tflite_path)
+
+ # Test the newly converted model on the test set.
+ tflite_test(model_settings, audio_processor, tflite_path)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+        help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from.')
+ parser.add_argument(
+ '--quantize',
+ dest='quantize',
+ action="store_true",
+ default=True,
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--no-quantize',
+ dest='quantize',
+ action="store_false",
+ help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.')
+ parser.add_argument(
+ '--inference_type',
+ type=str,
+ default='fp32',
+ help='If quantize is true, whether the model input and output is float32, int8 or int16')
+
+ FLAGS, _ = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py
new file mode 100644
index 0000000..05cf5ba
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py
@@ -0,0 +1,462 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modifications Copyright 2023 Arm Inc. All Rights Reserved.
+# Modified to use TensorFlow 2.0 and data pipelines.
+#
+"""Functions for loading and preparing data for keyword spotting."""
+
+import os
+import re
+import sys
+import urllib
+from pathlib import Path
+import tarfile
+import hashlib
+import random
+import math
+from enum import Enum
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops import gen_audio_ops as audio_ops
+
+MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M
+RANDOM_SEED = 59185
+BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
+SILENCE_LABEL = '_silence_'
+SILENCE_INDEX = 0
+UNKNOWN_WORD_INDEX = 1
+UNKNOWN_WORD_LABEL = '_unknown_'
+
+
+def load_wav_file(wav_filename, desired_samples):
+ """Loads and then decodes a given 16bit PCM wav file.
+
+ Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.
+
+ Args:
+ wav_filename: 16bit PCM wav file to load.
+ desired_samples: Number of samples wanted from the audio file.
+
+ Returns:
+ Tuple consisting of the decoded audio and sample rate.
+ """
+ wav_file = tf.io.read_file(wav_filename)
+ decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples)
+
+ return decoded_wav.audio, decoded_wav.sample_rate
+
+
+def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
+ """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.
+
+ Args:
+ audio_signal: Raw audio signal in range [-1, 1]
+ audio_sample_rate: Audio signal sample rate
+ window_size: Window size in samples for calculating spectrogram
+ window_stride: Window stride in samples for calculating spectrogram
+ num_mfcc: The number of MFCC features wanted.
+
+ Returns:
+      Calculated MFCC features.
+ """
+ spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride,
+ magnitude_squared=True)
+
+ mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
+
+ return mfcc_features
+
+
+def which_set(filename, validation_percentage, testing_percentage):
+ """Determines which data partition the file should belong to.
+
+ We want to keep files in the same training, validation, or testing sets even
+ if new ones are added over time. This makes it less likely that testing
+ samples will accidentally be reused in training when long runs are restarted
+ for example. To keep this stability, a hash of the filename is taken and used
+ to determine which set it should belong to. This determination only depends on
+ the name and the set proportions, so it won't change as other files are added.
+ It's also useful to associate particular files as related (for example words
+ spoken by the same person), so anything after '_nohash_' in a filename is
+ ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
+ 'bobby_nohash_1.wav' are always in the same set, for example.
+
+ Args:
+ filename: File path of the data sample.
+ validation_percentage: How much of the data set to use for validation.
+ testing_percentage: How much of the data set to use for testing.
+
+ Returns:
+ String, one of 'training', 'validation', or 'testing'.
+ """
+ base_name = os.path.basename(filename)
+ # We want to ignore anything after '_nohash_' in the file name when
+ # deciding which set to put a wav in, so the data set creator has a way of
+ # grouping wavs that are close variations of each other.
+ hash_name = re.sub(r'_nohash_.*$', '', base_name)
+ # This looks a bit magical, but we need to decide whether this file should
+ # go into the training, testing, or validation sets, and we want to keep
+ # existing files in the same set even if more files are subsequently
+ # added.
+ # To do that, we need a stable way of deciding based on just the file name
+ # itself, so we do a hash of that and then use that to generate a
+ # probability value that we use to assign it.
+ hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
+ percentage_hash = ((int(hash_name_hashed, 16) %
+ (MAX_NUM_WAVS_PER_CLASS + 1)) *
+ (100.0 / MAX_NUM_WAVS_PER_CLASS))
+ if percentage_hash < validation_percentage:
+ result = 'validation'
+ elif percentage_hash < (testing_percentage + validation_percentage):
+ result = 'testing'
+ else:
+ result = 'training'
+ return result
+
+
+def prepare_words_list(wanted_words):
+ """Prepends common tokens to the custom word list.
+
+ Args:
+ wanted_words: List of strings containing custom words to spot.
+
+ Returns:
+ List of words with silence and unknown tokens added.
+ """
+ return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words
+
+
+class AudioProcessor:
+ """Handles loading, partitioning, and preparing audio training data."""
+
+ class Modes(Enum):
+ TRAINING = 1
+ VALIDATION = 2
+ TESTING = 3
+
+ def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
+ wanted_words, validation_percentage, testing_percentage, model_settings):
+ self.data_dir = Path(data_dir)
+ self.model_settings = model_settings
+ self.words_list = prepare_words_list(wanted_words)
+
+ self._tf_datasets = {}
+ self.background_data = None
+ self._set_size = {'training': 0, 'validation': 0, 'testing': 0}
+
+ self._download_and_extract_data(data_url, data_dir)
+ self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage)
+ self._prepare_background_data()
+
+ def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0):
+ """Returns the train, validation or test set for KWS as a TF Dataset.
+
+ Args:
+ mode: The set to return, see AudioProcessor.Modes enumeration.
+ background_frequency: What proportion of the samples have background noise mixed in, 0.0 to 1.0.
+ background_volume_range: How loud the background noise should be, between 0 and 1.
+ time_shift: Range to randomly shift the training audio by in time.
+
+ Returns:
+ TF dataset that will generate tuples containing an mfcc and corresponding label.
+
+ Raises:
+ ValueError: If mode is not recognised.
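+
+ Example (illustrative only; 'processor' and the parameter values are arbitrary):
+ train_data = processor.get_data(AudioProcessor.Modes.TRAINING,
+                                 background_frequency=0.8,
+                                 background_volume_range=0.1,
+                                 time_shift=1600).batch(100)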
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ dataset = self._tf_datasets['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ dataset = self._tf_datasets['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ dataset = self._tf_datasets['testing']
+ else:
+ ValueError("Incorrect dataset type given")
+
+ use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING)
+ dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings,
+ background_frequency, background_volume_range,
+ time_shift, use_background, self.background_data),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+ return dataset
+
+ def set_size(self, mode):
+ """Get the number of samples in the requested dataset partition.
+
+ Args:
+ mode: Which partition, see AudioProcessor.Modes enumeration.
+
+ Returns:
+ Number of samples in the partition.
+
+ Raises:
+ ValueError: If mode is not recognised.
+ """
+ if mode == AudioProcessor.Modes.TRAINING:
+ return self._set_size['training']
+ elif mode == AudioProcessor.Modes.VALIDATION:
+ return self._set_size['validation']
+ elif mode == AudioProcessor.Modes.TESTING:
+ return self._set_size['testing']
+ else:
+ raise ValueError('Incorrect dataset type given')
+
+ @staticmethod
+ def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples,
+ use_background, background_data):
+ """Load wav files and calculate mfcc features.
+
+ Random shifting of samples and adding in background noise is done within this function as well.
+ This function is meant to be mapped onto a TF Dataset by using a lambda function.
+
+ Args:
+ path: Path to the wav file to load.
+ label: Integer label for classifying the audio clip.
+ model_settings: Dictionary of settings for model being trained.
+ background_frequency: What proportion of clips will have background noise added, 0.0 to 1.0.
+ background_volume_range: How loud the background noise will be.
+ time_shift_samples: How much to randomly shift the clips by.
+ use_background: Add in background noise to audio clips or not.
+ background_data: Ragged tensor of loaded background noise samples.
+
+ Returns:
+ Tuple of calculated flattened mfcc and its class label.
+ """
+
+ desired_samples = model_settings['desired_samples']
+ audio, sample_rate = load_wav_file(path, desired_samples=desired_samples)
+
+ # Make our own silence audio data.
+ if label == SILENCE_INDEX:
+ audio = tf.multiply(audio, 0)
+
+ # Shift samples start position and pad any gaps with zeros.
+ if time_shift_samples > 0:
+ time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples,
+ dtype=tf.int32)
+ else:
+ time_shift_amount = 0
+ if time_shift_amount > 0:
+ time_shift_padding = [[time_shift_amount, 0], [0, 0]]
+ time_shift_offset = [0, 0]
+ else:
+ time_shift_padding = [[0, -time_shift_amount], [0, 0]]
+ time_shift_offset = [-time_shift_amount, 0]
+
+ padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT')
+ sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])
+
+ # Get a random section of background noise.
+ if use_background:
+ background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32)
+ background_sample = background_data[background_index]
+ background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples,
+ dtype=tf.int32)
+ background_clipped = background_sample[background_offset:(background_offset + desired_samples)]
+ background_reshaped = tf.reshape(background_clipped, [desired_samples, 1])
+ if tf.random.uniform(shape=(), maxval=1) < background_frequency:
+ background_volume = tf.random.uniform(shape=(), maxval=background_volume_range)
+ else:
+ background_volume = tf.constant(0, dtype='float32')
+ else:
+ background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32)
+ background_volume = tf.constant(0, dtype='float32')
+
+ # Mix in background noise.
+ background_mul = tf.multiply(background_reshaped, background_volume)
+ background_add = tf.add(background_mul, sliced_foreground)
+ background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
+
+ mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'],
+ model_settings['window_stride_samples'],
+ model_settings['dct_coefficient_count'])
+ mfcc = tf.reshape(mfcc, [-1])
+
+ return mfcc, label
+
+ def _download_and_extract_data(self, data_url, target_directory):
+ """Downloads and extracts file to target directory.
+
+ If the file does not already exist download it and then untar into the target directory.
+
+ Args:
+ data_url: Web link to the tarred data to download.
+ target_directory: Directory to download and extract to.
+ """
+ target_directory = Path(target_directory)
+ target_directory.mkdir(exist_ok=True)
+
+ filename = data_url.split('/')[-1]
+ filepath = target_directory / filename
+
+ if not filepath.exists():
+ def _report_hook(block_num, block_size, total_size):
+ """Function to track download progress in urllib"""
+ read_so_far = block_num * block_size
+ percent = (read_so_far / total_size) * 100.0
+
+ s = f"\rDownloading {filename} {percent:.1f}%"
+
+ sys.stdout.write(s)
+ sys.stdout.flush()
+
+ filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook)
+ print()
+
+ print(f'Untarring {filename}...')
+ tarfile.open(filepath, 'r:gz').extractall(target_directory)
+
+ def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words,
+ validation_percentage, testing_percentage):
+ """Split the data into train, validation and testing sets.
+
+ Silence and unknown data is added, then sets are converted to TF Datasets.
+
+ Args:
+ silence_percentage: Percent of words that should be silence.
+ unknown_percentage: Percent of words that should be unknown.
+ wanted_words: List of words we want to classify.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ """
+ # Make sure the shuffling and picking of unknowns is deterministic.
+ random.seed(RANDOM_SEED)
+ wanted_words_index = {}
+
+ for index, wanted_word in enumerate(wanted_words):
+ wanted_words_index[wanted_word] = index + 2
+
+ # Find all wav files in subfolders.
+ search_path = self.data_dir / '*' / '*.wav'
+ data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage,
+ testing_percentage, wanted_words_index)
+
+ for index, wanted_word in enumerate(wanted_words):
+ if wanted_word not in all_words:
+ raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}')
+
+ word_to_index = {}
+ for word in all_words:
+ if word in wanted_words_index:
+ word_to_index[word] = wanted_words_index[word]
+ else:
+ word_to_index[word] = UNKNOWN_WORD_INDEX
+ word_to_index[SILENCE_LABEL] = SILENCE_INDEX
+
+ # We need an arbitrary file to load as the input for the silence samples.
+ # It's multiplied by zero later, so the content doesn't matter.
+ silence_wav_path = data_index['training'][0]['file']
+ for set_index in ['validation', 'testing', 'training']:
+ set_size = len(data_index[set_index]) # Size before adding silence and unknown samples.
+ silence_size = int(math.ceil(set_size * silence_percentage / 100))
+ for _ in range(silence_size):
+ data_index[set_index].append({
+ 'label': SILENCE_LABEL,
+ 'file': silence_wav_path
+ })
+ # Pick some unknowns to add to each partition of the data set.
+ random.shuffle(unknown_index[set_index])
+ unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
+ data_index[set_index].extend(unknown_index[set_index][:unknown_size])
+
+ self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples.
+
+ # Make sure the ordering is random.
+ random.shuffle(data_index[set_index])
+
+ # Transform into TF Datasets ready for easier processing later.
+ labels, paths = list(zip(*[d.values() for d in data_index[set_index]]))
+ labels = [word_to_index[label] for label in labels]
+ self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels))
+
+ def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index):
+ """Find and sort wav files into known and unknown word sets.
+
+ Known words are files containing words in the list of wanted words.
+ Any other clip goes to the unknown label set. Labels come from the folder names.
+ All clips are also assigned to train, test and validation sets.
+
+ Args:
+ search_pattern: Path pattern used by glob to find wav files.
+ validation_percentage: Percent to split off for validation.
+ testing_percentage: Percent to split off for testing.
+ wanted_words_index: Dict mapping wanted words to their label index.
+
+ Returns:
+ 3-tuple of known words, unknown words and mapping of all word labels.
+ """
+ data_index = {'validation': [], 'testing': [], 'training': []}
+ unknown_index = {'validation': [], 'testing': [], 'training': []}
+ all_words = {}
+
+ for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))):
+ word = Path(wav_path).parent.name.lower()
+
+ # Treat the '_background_noise_' folder as a special case, since we expect
+ # it to contain long audio samples we mix in to improve training.
+ if word == BACKGROUND_NOISE_DIR_NAME:
+ continue
+
+ all_words[word] = True
+ set_index = which_set(wav_path, validation_percentage, testing_percentage)
+ # If it's a known class, store its detail, otherwise add it to the list
+ # we'll use to train the unknown label.
+ if word in wanted_words_index:
+ data_index[set_index].append({'label': word, 'file': wav_path})
+ else:
+ unknown_index[set_index].append({'label': word, 'file': wav_path})
+ if not all_words:
+ raise Exception('No .wavs found at ' + str(search_pattern))
+
+ return data_index, unknown_index, all_words
+
+ def _prepare_background_data(self):
+ """Searches a folder for background noise audio, and loads it into memory.
+
+ It's expected that the background audio samples will be in a subdirectory
+ named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
+ the sample rate of the training data, but can be much longer in duration.
+
+ If the '_background_noise_' folder doesn't exist at all, this isn't an
+ error, it's just taken to mean that no background noise augmentation should
+ be used. If the folder does exist, but it's empty, that's treated as an
+ error.
+
+ Returns:
+ Ragged tensor of raw PCM-encoded audio samples of background noise.
+ None if the '_background_noise_' folder doesn't exist.
+
+ Raises:
+ Exception: If files aren't found in the folder.
+ """
+ background_data = []
+ background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME)
+ if not background_dir.exists():
+ self.background_data = None
+ return
+
+ search_path = Path(background_dir / '*.wav')
+ for wav_path in tf.io.gfile.glob(str(search_path)):
+ wav_data, _ = load_wav_file(wav_path, desired_samples=-1)
+ background_data.append(tf.reshape(wav_data, [-1]))
+
+ if not background_data:
+ raise Exception('No background wav files were found in ' + str(search_path))
+
+ # Ragged tensor as we can't use lists in tf dataset map functions.
+ self.background_data = tf.ragged.stack(background_data)
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py
new file mode 100644
index 0000000..db7694a
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py
@@ -0,0 +1,76 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import argparse
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+     return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+
+ model = tf.keras.models.load_model(FLAGS.keras_file_path)
+ predictions = model.predict(x)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--keras_file_path',
+ type=str,
+ default='',
+ help='Path to the .h5 Keras model file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py
new file mode 100644
index 0000000..9f79d99
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py
@@ -0,0 +1,120 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from data_processing.data_preprocessing import load_wav_file, calculate_mfcc
+
+import tensorflow as tf
+import numpy as np
+import argparse
+
+
+def tflite_inference(input_data, tflite_path):
+ """Call forwards pass of TFLite file and returns the result.
+
+ Args:
+ input_data: Input data to use on forward pass.
+ tflite_path: Path to TFLite file to run.
+
+ Returns:
+ Output from inference.
+ """
+ supported_quant_dtypes = (np.int8, np.int16)
+ interpreter = tf.lite.Interpreter(model_path=tflite_path)
+ interpreter.allocate_tensors()
+
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+
+ input_dtype = input_details[0]["dtype"]
+ output_dtype = output_details[0]["dtype"]
+
+ # Check if the input/output type is quantized,
+ # set scale and zero-point accordingly
+ if input_dtype in supported_quant_dtypes:
+ input_scale, input_zero_point = input_details[0]["quantization"]
+ else:
+ input_scale, input_zero_point = 1, 0
+
+ input_data = input_data / input_scale + input_zero_point
+ input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data
+
+ if output_dtype in supported_quant_dtypes:
+ output_scale, output_zero_point = output_details[0]["quantization"]
+ else:
+ output_scale, output_zero_point = 1, 0
+
+ interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype))
+ interpreter.invoke()
+
+ output_data = interpreter.get_tensor(output_details[0]['index'])
+
+ output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
+
+ return output_data
+
+
+def load_labels(filename):
+ """Read in labels, one label per line."""
+ with open(filename, "r") as f:
+     return f.read().splitlines()
+
+
+def main():
+ window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000)
+ window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000)
+ decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate)
+ x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count)
+ x = tf.reshape(x, [1, -1])
+ predictions = tflite_inference(x, FLAGS.tflite_path)
+
+ # Sort to show labels in order of confidence
+ top_k = predictions[0].argsort()[-1:][::-1]
+ for node_id in top_k:
+ human_string = load_labels(FLAGS.labels)[int(node_id)]
+ score = predictions[0,node_id]
+ print(f'model predicted: {human_string} with score {score:.5f}')
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--wav', type=str, default='', help='Audio file to be identified.')
+ parser.add_argument(
+ '--labels', type=str, default='', help='Path to file containing labels.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs', )
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is', )
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices', )
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint', )
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ default='',
+ help='Path to TFLite file to use for testing.')
+ FLAGS, unparsed = parser.parse_known_args()
+ main()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py
new file mode 100644
index 0000000..9488d35
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py
@@ -0,0 +1,250 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files."""
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+from ds_cnn_s_inference_tflite import tflite_inference
+
+
+def tflite_test(model_settings, audio_processor, tflite_path):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A TFLite model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ tflite_path: Path to TFLite file to use for inference.
+ """
+ # Evaluate on validation set.
+ print("Running TFLite evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1)
+ expected_indices = np.concatenate([y for x, y in val_data])
+ predicted_indices = []
+
+ for mfcc, label in val_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TFLite evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1)
+ expected_indices = np.concatenate([y for x, y in test_data])
+ predicted_indices = []
+
+ for mfcc, label in test_data:
+ prediction = tflite_inference(mfcc, tflite_path)
+ predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def keras_test(model_settings, audio_processor, model):
+ """Calculate accuracy and confusion matrices on the validation and test sets.
+
+ A loaded keras model is used for doing testing.
+
+ Args:
+ model_settings: Dictionary of common model settings.
+ audio_processor: Audio processor class object.
+ model: Loaded keras model.
+ """
+ # Evaluate on validation set.
+ print("Running TF evaluation on validation set...")
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in val_data])
+
+ predictions = model.predict(val_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ val_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Validation accuracy = {val_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})')
+
+ # Evaluate on testing set.
+ print("Running TF evaluation on test set...")
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size)
+ expected_indices = np.concatenate([y for x, y in test_data])
+
+ predictions = model.predict(test_data)
+ predicted_indices = tf.argmax(predictions, axis=1)
+
+ test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
+ confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
+ num_classes=model_settings['label_count'])
+ print(confusion_matrix.numpy())
+ print(f'Test accuracy = {test_accuracy * 100:.2f}%'
+ f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})')
+
+
+def calculate_accuracy(predicted_indices, expected_indices):
+ """Calculates and returns accuracy.
+
+ Args:
+ predicted_indices: List of predicted integer indices.
+ expected_indices: List of expected integer indices.
+
+ Returns:
+ Accuracy value between 0 and 1.
+ """
+ correct_prediction = tf.equal(predicted_indices, expected_indices)
+ accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+ return accuracy
+
+
+def evaluate():
+ """Calculate accuracy and confusion matrices on validation and test sets.
+
+ Model is created and weights loaded from supplied command line arguments.
+ """
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ if FLAGS.tflite_path:
+ tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
+
+ if FLAGS.checkpoint:
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+ keras_test(model_settings, audio_processor, model)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from')
+ parser.add_argument(
+ '--tflite_path',
+ type=str,
+ help='Path to TFLite file to use for evaluation')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ evaluate()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb b/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb
new file mode 100644
index 0000000..1391914
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DS_CNN_Small - Hero\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| DS_CNN_Small \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Hero |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. \n",
+ "\n",
+ " \n",
+ "* [1.0 Model recreation](#model_recreation)\n",
+ "\n",
+ "* [2.0 Training](#training)\n",
+ "\n",
+ "* [3.0 Testing](#testing)\n",
+ "\n",
+ "* [4.0 Optimization](#optimization)\n",
+ "\n",
+ "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n",
+ "\n",
+ "* [6.0 Inference the TFLite model files](#tflite_inference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1.0 Model Recreation\n",
+ "\n",
+ "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n",
+ "\n",
+ "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:04:29.102214: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:05:19.918303: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:05:19.952173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:05:19.952211: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:05:19.971851: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:05:19.971921: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:05:19.974596: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:05:19.974884: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:05:19.975441: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:05:19.976147: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:05:19.976295: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:05:19.976755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:05:19.977035: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:05:19.977720: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:05:19.978052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:05:19.978106: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:05:20.390120: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:05:20.390158: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:05:20.390167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:05:20.390683: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:05:22.730373: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:05:24.433377: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:05:24.433576: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:05:24.434021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:05:24.434280: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:05:24.434312: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:05:24.434324: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:05:24.434333: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:05:24.434616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:05:24.451559: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:05:24.458087: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.014ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.001ms.\n",
+ "\n",
+ "2023-01-31 12:05:24.730913: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:05:24.730951: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:05:24.736446: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:05:24.739564: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:05:24.739849: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:05:24.739885: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:05:24.739895: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:05:24.739902: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:05:24.740218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "Converted model saved to ds_cnn.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "2023-01-31 12:05:24.804992: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 311 1 9 2 9 8 4 11 3 4 9]\n",
+ " [ 0 5 387 1 0 0 3 0 0 0 0 1]\n",
+ " [ 0 11 5 372 1 6 5 0 0 0 0 6]\n",
+ " [ 0 4 0 0 327 0 2 0 1 10 6 0]\n",
+ " [ 0 2 2 6 0 360 0 1 1 0 1 4]\n",
+ " [ 0 1 7 0 3 1 333 5 0 0 0 2]\n",
+ " [ 0 5 0 1 0 0 5 350 1 0 0 1]\n",
+ " [ 1 5 0 1 4 1 0 1 343 7 0 0]\n",
+ " [ 0 1 1 1 16 0 2 1 5 343 1 2]\n",
+ " [ 1 2 0 0 9 1 0 0 0 3 334 0]\n",
+ " [ 0 15 0 14 1 6 0 0 0 2 3 331]]\n",
+ "Validation accuracy = 93.63%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 354 5 8 4 2 7 8 6 0 5 9]\n",
+ " [ 0 5 404 1 0 0 9 0 0 0 0 0]\n",
+ " [ 0 4 1 380 0 7 3 0 0 0 0 10]\n",
+ " [ 0 4 0 0 396 1 1 0 2 14 4 3]\n",
+ " [ 0 12 1 9 0 376 2 0 1 0 1 4]\n",
+ " [ 0 2 7 1 1 0 399 1 0 0 1 0]\n",
+ " [ 0 10 0 0 1 1 6 376 0 0 2 0]\n",
+ " [ 0 7 1 0 4 0 0 0 364 16 1 3]\n",
+ " [ 1 5 1 3 12 0 1 0 2 369 1 7]\n",
+ " [ 0 1 0 1 4 2 1 0 1 1 397 3]\n",
+ " [ 0 3 2 18 1 5 1 0 0 2 2 368]]\n",
+ "Test accuracy = 93.89%(N=4890)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-01-31 12:05:46.655980: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "Untarring speech_commands_v0.02.tar.gz...\n",
+ "2023-01-31 12:06:37.310206: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n",
+ "2023-01-31 12:06:37.346033: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:06:37.346068: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:06:37.365782: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n",
+ "2023-01-31 12:06:37.365855: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n",
+ "2023-01-31 12:06:37.368622: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n",
+ "2023-01-31 12:06:37.368939: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n",
+ "2023-01-31 12:06:37.369500: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n",
+ "2023-01-31 12:06:37.370276: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n",
+ "2023-01-31 12:06:37.370427: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n",
+ "2023-01-31 12:06:37.370808: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:06:37.371101: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-01-31 12:06:37.371913: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:06:37.372648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:06:37.372708: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
+ "2023-01-31 12:06:37.810221: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:06:37.810261: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:06:37.810269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:06:37.810782: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n",
+ "2023-01-31 12:06:40.113450: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
+ "2023-01-31 12:06:41.895930: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n",
+ "2023-01-31 12:06:41.896029: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n",
+ "2023-01-31 12:06:41.896600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:06:41.896861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:06:41.896892: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:06:41.896901: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:06:41.896909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:06:41.897198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:06:41.915523: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n",
+ "2023-01-31 12:06:41.922229: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.019ms.\n",
+ " function_optimizer: function_optimizer did nothing. time = 0.003ms.\n",
+ "\n",
+ "2023-01-31 12:06:42.074632: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n",
+ "2023-01-31 12:06:42.074672: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n",
+ "2023-01-31 12:06:42.079631: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2023-01-31 12:06:42.082664: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n",
+ "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n",
+ "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n",
+ "2023-01-31 12:06:42.082962: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n",
+ "2023-01-31 12:06:42.083001: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
+ "2023-01-31 12:06:42.083013: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n",
+ "2023-01-31 12:06:42.083021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n",
+ "2023-01-31 12:06:42.083360: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n",
+ "2023-01-31 12:06:42.114217: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n",
+ "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n",
+ "Quantized model saved to ds_cnn_quantized.tflite.\n",
+ "Running TFLite evaluation on validation set...\n",
+ "[[371 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 308 4 6 1 11 5 6 11 4 5 10]\n",
+ " [ 0 5 379 0 0 1 9 0 0 1 0 2]\n",
+ " [ 0 10 3 369 3 6 6 0 1 2 0 6]\n",
+ " [ 0 5 0 1 315 0 3 0 2 13 10 1]\n",
+ " [ 0 5 2 9 0 347 0 1 3 1 5 4]\n",
+ " [ 0 2 5 1 2 1 335 4 0 1 1 0]\n",
+ " [ 0 7 0 1 2 0 7 342 1 1 1 1]\n",
+ " [ 1 4 0 1 6 1 0 0 343 6 1 0]\n",
+ " [ 0 2 0 1 22 0 1 0 6 336 2 3]\n",
+ " [ 1 4 0 0 14 0 0 0 0 1 328 2]\n",
+ " [ 0 12 0 16 2 9 0 0 1 2 4 326]]\n",
+ "Validation accuracy = 92.22%(N=4445)\n",
+ "Running TFLite evaluation on test set...\n",
+ "[[408 0 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 347 3 8 5 5 8 8 9 1 6 8]\n",
+ " [ 0 7 399 2 0 1 8 0 0 1 1 0]\n",
+ " [ 0 4 1 377 4 7 2 0 0 0 1 9]\n",
+ " [ 0 5 1 0 390 1 1 1 2 14 6 4]\n",
+ " [ 0 15 0 12 2 361 4 0 2 1 1 8]\n",
+ " [ 0 6 5 2 4 0 393 2 0 0 0 0]\n",
+ " [ 0 9 0 0 5 0 10 365 1 1 2 3]\n",
+ " [ 0 9 0 1 6 1 3 1 357 15 1 2]\n",
+ " [ 0 4 1 2 15 0 1 0 2 369 1 7]\n",
+ " [ 0 1 0 2 4 3 2 0 1 3 393 2]\n",
+ " [ 0 5 2 21 3 7 2 0 0 3 1 358]]\n",
+ "Test accuracy = 92.37%(N=4890)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!bash ./recreate_model.sh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n",
+ "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n",
+ "\n",
+ "To apply the optimization and fine-tuning, run the following command:\n",
+ "```\n",
+ "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n",
+ "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n",
+ "\n",
+ "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model."
+ ]
+ },
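+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a rough sketch of what the clustering step does (not the exact `optimisations.py` implementation), applying weight clustering with the TensorFlow Model Optimization toolkit looks roughly like the snippet below. It assumes `tensorflow_model_optimization` is installed and that `model`, `train_data` and `val_data` already exist; the optimizer, learning rate and epoch count are illustrative only:\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "import tensorflow_model_optimization as tfmot\n",
+ "\n",
+ "cluster_weights = tfmot.clustering.keras.cluster_weights\n",
+ "CentroidInitialization = tfmot.clustering.keras.CentroidInitialization\n",
+ "\n",
+ "# 32 clusters with kmeans++ initialization, matching the hyperparameters described above.\n",
+ "clustering_params = {\n",
+ "    'number_of_clusters': 32,\n",
+ "    'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS,\n",
+ "}\n",
+ "\n",
+ "# Wrap the trained model so each layer's weights are constrained to the cluster centroids.\n",
+ "clustered_model = cluster_weights(model, **clustering_params)\n",
+ "\n",
+ "# Fine-tune briefly at a low learning rate, then strip the clustering wrappers before export.\n",
+ "clustered_model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),\n",
+ "                        loss='sparse_categorical_crossentropy',\n",
+ "                        metrics=['accuracy'])\n",
+ "clustered_model.fit(train_data, validation_data=val_data, epochs=3)\n",
+ "final_model = tfmot.clustering.keras.strip_clustering(clustered_model)\n",
+ "```"
+ ]
+ },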
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.0 Quantization and TFLite Conversion\n",
+ "\n",
+ "You can now use TensorFlow's\n",
+ "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n",
+ "make quantization of the trained models super simple.\n",
+ "\n",
+ "To quantize your trained model (e.g. a DNN) run:\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n",
+ "\n",
+ "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*."
+ ]
+ },
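+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, a minimal sketch of the post-training quantization flow that such a conversion performs is shown below. It is not the literal `convert_to_tflite.py` code; it assumes `model` is the trained Keras model and `audio_processor` is an `AudioProcessor` instance used to supply calibration samples:\n",
+ "\n",
+ "```python\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def representative_dataset():\n",
+ "    # A few hundred training MFCCs are enough to calibrate the quantization ranges.\n",
+ "    for mfcc, _ in audio_processor.get_data(audio_processor.Modes.TRAINING).batch(1).take(100):\n",
+ "        yield [tf.cast(mfcc, tf.float32)]\n",
+ "\n",
+ "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+ "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+ "converter.representative_dataset = representative_dataset\n",
+ "\n",
+ "# For a fully integer model (int8 inputs and outputs), also restrict the ops and IO types.\n",
+ "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+ "converter.inference_input_type = tf.int8\n",
+ "converter.inference_output_type = tf.int8\n",
+ "\n",
+ "tflite_model = converter.convert()\n",
+ "with open('dnn_quantized.tflite', 'wb') as f:\n",
+ "    f.write(tflite_model)\n",
+ "```"
+ ]
+ },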
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can test the accuracy of this quantized model on the test set by running:\n",
+ "```\n",
+ "python evaluation.py --tflite_path dnn_quantized.tflite\n",
+ "```\n",
+ "**The model and feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n",
+ "\n",
+ "```\n",
+ "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n",
+ "```\n",
+ "\n",
+ "This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.0 Single inference of the TFLite model files \n",
+ "\n",
+ "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n",
+ "\n",
+ "```python ds_cnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n",
+ "\n",
+ "**The feature extraction parameters used here should match those used in the Training step.**\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
new file mode 100644
index 0000000..b8fbdcb
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32
+
+## Description
+This is a floating point fp32 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | 8aadd5126bc0d3371c1b834d027c853e794423c1 |
+| Size (Bytes) | 98756 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 93.89% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..71aa3f6
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ accuracy: 93.89%
+ benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DS-CNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: fp32
+ file_size_bytes: 98756
+ filename: ds_cnn_s.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: 8aadd5126bc0d3371c1b834d027c853e794423c1
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: fp32
+ use_case: Random input for model regression.
+ input_datatype: fp32
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: fp32
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: fp32
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Hero
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: false
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite
new file mode 100644
index 0000000..3fb7602
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d302f1f2c53c1344edcde850e28130c0877b60e1567db977292239a9391f59b
+size 98756
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
new file mode 100644
index 0000000..27d44a7
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ee7676110faaf59275371c1d6b27097d657f049967840cbd214d62a272fa543
+size 2088
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
new file mode 100644
index 0000000..38660ee
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fad0cf24907c9eeb36f99fb498f09667e129f1cdbcca9b50cd826e9322b145d1
+size 176
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md
new file mode 100644
index 0000000..b025116
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16
+
+## Description
+This is a fully quantized int16 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int16 |
+| SHA-1 Hash | e82c7d645bec3dec580a096de0a297c6dd9a6463 |
+| Size (Bytes) | 55392 |
+| Provenance | https://github.com/ARM-software/ML-examples/tree/main/tflu-kws-cortex-m |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 93.39% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| serving_default_input:0 | (1, 490) | int16 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input | int16 | [1, 490] | The input is processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| StatefulPartitionedCall:0 | (1, 12) | int16 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output | int16 | [1, 12] | The probabilities of the 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml
new file mode 100644
index 0000000..730a6cc
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 93.39%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int16 version of the DS-CNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int16
+ file_size_bytes: 55392
+ filename: ds_cnn_s_quantized_int16.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: e82c7d645bec3dec580a096de0a297c6dd9a6463
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input
+ shape:
+ - 1
+ - 490
+ type: int16
+ use_case: Random input for model regression.
+ input_datatype: int16
+ name: serving_default_input:0
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output
+ shape:
+ - 1
+ - 12
+ type: int16
+ use_case: output for model regression.
+ name: StatefulPartitionedCall:0
+ output_datatype: int16
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Hero
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite
new file mode 100644
index 0000000..d3d56fe
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e80b231d6848e6de69d70d36a17f9bb64022ae46d9957b1f6972b6527f943186
+size 55392
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy
new file mode 100644
index 0000000..797c2b0
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e24c5c602a9c74776927198465769dc6e80645663bf7604ae45aed0586a066a
+size 1108
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy
new file mode 100644
index 0000000..4e37127
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:397aff56a28c4e81818c117ae49b216ad8ae501c3612b7abac2cdf9f45ccbf44
+size 152
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
new file mode 100644
index 0000000..3e9a6cc
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md
@@ -0,0 +1,62 @@
+# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8
+
+## Description
+This is a fully quantized int8 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper.
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | int8 |
+| SHA-1 Hash | cf24429e86a9647b1632c382894bc68d26d34039 |
+| Size (Bytes) | 47616 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| Accuracy | 93.11% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_check_mark: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_check_mark: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probabilities of the 12 keywords |
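+
+## Example Usage
+The example tensors above can be used for a quick regression check of this model. A minimal sketch, assuming TensorFlow and NumPy are installed and paths are relative to the repository root:
+```
+import numpy as np
+import tensorflow as tf
+
+base = 'models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/'
+interpreter = tf.lite.Interpreter(model_path=base + 'ds_cnn_s_quantized.tflite')
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+x = np.load(base + 'testing_input/input/0.npy')           # int8, shape (1, 490)
+interpreter.set_tensor(inp['index'], x)
+interpreter.invoke()
+y = interpreter.get_tensor(out['index'])
+y_ref = np.load(base + 'testing_output/Identity/0.npy')   # int8, shape (1, 12)
+print('match:', np.array_equal(y, y_ref))
+```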
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
new file mode 100644
index 0000000..6d2f978
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+ benchmark_metrics:
+ Accuracy: 93.11%
+ benchmark_name: Google Speech Commands test set
+description: This is a fully quantized int8 version of the DS-CNN Small model developed
+ by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+ datatype: int8
+ file_size_bytes: 47616
+ filename: ds_cnn_s_quantized.tflite
+ framework: TensorFlow Lite
+ hash:
+ algorithm: sha1
+ value: cf24429e86a9647b1632c382894bc68d26d34039
+ provenance: https://arxiv.org/abs/1711.07128
+ training: Trained by Arm
+network_parameters:
+ input_nodes:
+  - description: The input is processed MFCCs of shape (1, 490)
+ example_input:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input
+ shape:
+ - 1
+ - 490
+ type: int8
+ use_case: Random input for model regression.
+ input_datatype: int8
+ name: input
+ shape:
+ - 1
+ - 490
+ output_nodes:
+  - description: The probabilities of the 12 keywords.
+ example_output:
+ path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity
+ shape:
+ - 1
+ - 12
+ type: int8
+ use_case: output for model regression.
+ name: Identity
+ output_datatype: int8
+ shape:
+ - 1
+ - 12
+network_quality:
+ clustered: false
+ is_vanilla: true
+ pruned: false
+ quality_level: Hero
+ quality_level_hero_hw: cortex_m
+ quantization_default: false
+ quantization_full: true
+ recreate: true
+operators:
+ TensorFlow Lite:
+ - AVERAGE_POOL_2D
+ - CONV_2D
+ - DEPTHWISE_CONV_2D
+ - FULLY_CONNECTED
+ - RELU
+ - RESHAPE
+ - SOFTMAX
+paper: https://arxiv.org/abs/1711.07128
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ds_cnn_s_quantized.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_s_quantized.tflite
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ds_cnn_s_quantized.tflite
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_s_quantized.tflite
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input/0.npy
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity/0.npy
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb
new file mode 100644
index 0000000..a265c82
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edda8ec1a48de025c96dfcef1163b343f69616f516a6fec12279e71c5a58b4d2
+size 65399
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb
new file mode 100644
index 0000000..3fd736c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ef43701d6901c7fa2452cf5390d2198b7ba14a3e5f41d10385ec152f0631349
+size 708163
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..4217bf8
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cee02f3a1e371e6de9e2192600842bd92be832739233b8bdeaf6f3f3b9f1e73
+size 118118
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index
new file mode 100644
index 0000000..364f025
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e77718d9698810a79c1ce8989db07245c69ae8d0277c5337703e3f32c6a863f5
+size 3570
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/checkpoint
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.data-00000-of-00001
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.data-00000-of-00001
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.data-00000-of-00001
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.index b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.index
similarity index 100%
rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.index
rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.index
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py
new file mode 100644
index 0000000..1978136
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py
@@ -0,0 +1,327 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model definitions for simple keyword spotting."""
+
+import math
+
+import tensorflow as tf
+
+
+def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
+ window_size_ms, window_stride_ms,
+ dct_coefficient_count):
+ """Calculates common settings needed for all models.
+
+ Args:
+ label_count: How many classes are to be recognized.
+ sample_rate: Number of audio samples per second.
+ clip_duration_ms: Length of each audio clip to be analyzed.
+ window_size_ms: Duration of frequency analysis window.
+ window_stride_ms: How far to move in time between frequency windows.
+ dct_coefficient_count: Number of frequency bins to use for analysis.
+
+ Returns:
+ Dictionary containing common settings.
+ """
+ desired_samples = int(sample_rate * clip_duration_ms / 1000)
+ window_size_samples = int(sample_rate * window_size_ms / 1000)
+ window_stride_samples = int(sample_rate * window_stride_ms / 1000)
+ length_minus_window = (desired_samples - window_size_samples)
+ if length_minus_window < 0:
+ spectrogram_length = 0
+ else:
+ spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
+ fingerprint_size = dct_coefficient_count * spectrogram_length
+
+ return {
+ 'desired_samples': desired_samples,
+ 'window_size_samples': window_size_samples,
+ 'window_stride_samples': window_stride_samples,
+ 'spectrogram_length': spectrogram_length,
+ 'dct_coefficient_count': dct_coefficient_count,
+ 'fingerprint_size': fingerprint_size,
+ 'label_count': label_count,
+ 'sample_rate': sample_rate,
+ }
+
+
+def create_model(model_settings, model_architecture, model_size_info, is_training):
+ """Builds a tf.keras model of the requested architecture compatible with the settings.
+
+ Args:
+ model_settings: Dictionary of information about the model.
+ model_architecture: String specifying which kind of model to create.
+ model_size_info: Array with specific information for the chosen architecture
+ (e.g. convolutional parameters, number of layers).
+ is_training: Whether the model is being built for training (affects LSTM unrolling).
+
+ Returns:
+ A tf.keras Model with the requested architecture.
+
+ Raises:
+ Exception: If the architecture type isn't recognized.
+ """
+
+ if model_architecture == 'dnn':
+ return create_dnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'cnn':
+ return create_cnn_model(model_settings, model_size_info)
+
+ elif model_architecture == 'ds_cnn':
+ return create_ds_cnn_model(model_settings, model_size_info)
+ elif model_architecture == 'single_fc':
+ return create_single_fc_model(model_settings)
+ elif model_architecture == 'basic_lstm':
+ return create_basic_lstm_model(model_settings, model_size_info, is_training)
+ else:
+ raise Exception(f'model_architecture argument {model_architecture} not recognized, '
+ f'should be one of "dnn", "cnn", "ds_cnn", "single_fc", "basic_lstm"')
+
+
+def create_single_fc_model(model_settings):
+ """Builds a model with a single fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+
+ Returns:
+ tf.keras Model of the 'SINGLE_FC' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+ # Fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_basic_lstm_model(model_settings, model_size_info, is_training):
+ """Builds a model with a basic lstm layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden layers and
+ each element in the array represents the number of neurons in that layer.
+ is_training: Whether the model is being built for training (controls LSTM unrolling).
+
+ Returns:
+ tf.keras Model of the 'Basic_LSTM' architecture.
+ """
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size))
+
+ # LSTM layer, and unrolling depending on whether you are training or not
+ if is_training:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x)
+ else:
+ x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x)
+
+ # Outputs a fully connected layer
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_dnn_model(model_settings, model_size_info):
+ """Builds a model with multiple hidden fully-connected layers.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Length of the array defines the number of hidden layers and
+ each element in the array represents the number of neurons in that layer.
+
+ Returns:
+ tf.keras Model of the 'DNN' architecture.
+ """
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input')
+
+ # First fully connected layer.
+ x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs)
+
+ # Hidden layers with ReLU activations.
+ for i in range(1, len(model_size_info)):
+ x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x)
+
+ # Output fully connected layer.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_cnn_model(model_settings, model_size_info):
+ """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer.
+
+ For details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines the first and second convolution parameters in
+ {number of conv features, conv filter height, width, stride in y,x dir.},
+ followed by linear layer size and fully-connected layer size.
+
+ Returns:
+ tf.keras Model of the 'CNN' architecture.
+ """
+
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ first_filter_count = model_size_info[0]
+ first_filter_height = model_size_info[1] # Time axis.
+ first_filter_width = model_size_info[2] # Frequency axis.
+ first_filter_stride_y = model_size_info[3] # Time axis.
+ first_filter_stride_x = model_size_info[4] # Frequency axis.
+
+ second_filter_count = model_size_info[5]
+ second_filter_height = model_size_info[6] # Time axis.
+ second_filter_width = model_size_info[7] # Frequency axis.
+ second_filter_stride_y = model_size_info[8] # Time axis.
+ second_filter_stride_x = model_size_info[9] # Frequency axis.
+
+ linear_layer_size = model_size_info[10]
+ fc_size = model_size_info[11]
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=first_filter_count,
+ kernel_size=(first_filter_height, first_filter_width),
+ strides=(first_filter_stride_y, first_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Second convolution.
+ x = tf.keras.layers.Conv2D(filters=second_filter_count,
+ kernel_size=(second_filter_height, second_filter_width),
+ strides=(second_filter_stride_y, second_filter_stride_x),
+ padding='VALID')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Flatten for fully connected layers.
+ x = tf.keras.layers.Flatten()(x)
+
+ # Fully connected layer with no activation.
+ x = tf.keras.layers.Dense(units=linear_layer_size)(x)
+
+ # Fully connected layer with ReLU activation.
+ x = tf.keras.layers.Dense(units=fc_size)(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ x = tf.keras.layers.Dropout(rate=0)(x)
+
+ # Output fully connected.
+ output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
+
+
+def create_ds_cnn_model(model_settings, model_size_info):
+ """Builds a model with convolutional & depthwise separable convolutional layers.
+
+ For more details see https://arxiv.org/abs/1711.07128.
+
+ Args:
+ model_settings: Dict of different settings for model training.
+ model_size_info: Defines number of layers, followed by the DS-Conv layer
+ parameters in the order {number of conv features, conv filter height,
+ width and stride in y,x dir.} for each of the layers.
+
+ Returns:
+ tf.keras Model of the 'DS-CNN' architecture.
+ """
+
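+ # Example layout (DS-CNN Small, as used in recreate_model.sh):
+ #   model_size_info = [5, 64, 10, 4, 2, 2, 64, 3, 3, 1, 1, 64, 3, 3, 1, 1, 64, 3, 3, 1, 1, 64, 3, 3, 1, 1]
+ #   i.e. 5 layers, each described by {features, kernel height (time), kernel width (freq), stride (time), stride (freq)}.
+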
+ label_count = model_settings['label_count']
+ input_frequency_size = model_settings['dct_coefficient_count']
+ input_time_size = model_settings['spectrogram_length']
+
+ t_dim = input_time_size
+ f_dim = input_frequency_size
+
+ # Extract model dimensions from model_size_info.
+ num_layers = model_size_info[0]
+ conv_feat = [None]*num_layers
+ conv_kt = [None]*num_layers
+ conv_kf = [None]*num_layers
+ conv_st = [None]*num_layers
+ conv_sf = [None]*num_layers
+
+ i = 1
+ for layer_no in range(0, num_layers):
+ conv_feat[layer_no] = model_size_info[i]
+ i += 1
+ conv_kt[layer_no] = model_size_info[i]
+ i += 1
+ conv_kf[layer_no] = model_size_info[i]
+ i += 1
+ conv_st[layer_no] = model_size_info[i]
+ i += 1
+ conv_sf[layer_no] = model_size_info[i]
+ i += 1
+
+ inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input')
+
+ # Reshape the flattened input.
+ x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1))
+
+ # Depthwise separable convolutions.
+ for layer_no in range(0, num_layers):
+ if layer_no == 0:
+ # First convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[0],
+ kernel_size=(conv_kt[0], conv_kf[0]),
+ strides=(conv_st[0], conv_sf[0]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+ else:
+ # Depthwise convolution.
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]),
+ strides=(conv_sf[layer_no], conv_st[layer_no]),
+ padding='SAME')(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ # Pointwise convolution.
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x)
+ x = tf.keras.layers.BatchNormalization()(x)
+ x = tf.keras.layers.ReLU()(x)
+
+ t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
+ f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))
+
+ # Global average pool.
+ x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x)
+
+ # Squeeze before passing to output fully connected layer.
+ x = tf.reshape(x, shape=(-1, conv_feat[layer_no]))
+
+ # Output connected layer.
+ output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+
+ return tf.keras.Model(inputs, output)
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py
new file mode 100644
index 0000000..16b6f4c
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py
@@ -0,0 +1,259 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for optimizing simple keyword spotting models using clustering API."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+import tensorflow_model_optimization as tfmot
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def print_model_weight_clusters(model):
+
+ for layer in model.layers:
+ if isinstance(layer, tf.keras.layers.Wrapper):
+ weights = layer.trainable_weights
+ else:
+ weights = layer.weights
+ for weight in weights:
+ if "kernel" in weight.name:
+ unique_count = len(np.unique(weight))
+ print(
+ f"{layer.name}/{weight.name}: {unique_count} clusters "
+ )
+
+
+def optimize():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model to optimize from checkpoint.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, is_training=True)
+ model.load_weights(FLAGS.checkpoint).expect_partial()
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ cluster_weights = tfmot.clustering.keras.cluster_weights
+ CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
+
+ clustering_params = {
+ 'number_of_clusters': 32,
+ 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS}
+
+ clustered_model = cluster_weights(model, **clustering_params)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Train the model with clustering applied.
+ clustered_model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data)
+
+ stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+
+ print_model_weight_clusters(stripped_clustered_model)
+
+ # Save the clustered model weights
+ train_dir = Path(FLAGS.train_dir) / "optimized"
+ train_dir.mkdir(parents=True, exist_ok=True)
+
+ stripped_clustered_model.save_weights((train_dir /
+ (FLAGS.model_architecture +
+ "_clustered_ckpt")))
+
+ # Test the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ stripped_clustered_model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='3750,750',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--save_step_interval',
+ type=int,
+ default=100,
+ help='Save model checkpoint every save_steps.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ help='Checkpoint to load the weights from before fine-tuning.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ optimize()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh
new file mode 100644
index 0000000..a081905
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ckpt_path=model_archive/model_source/weights/ds_cnn_0.94_ckpt
+train=false
+
+# Parse command line args
+while (( $# >= 1 )); do
+ case $1 in
+ --ckpt)
+ if [ "$2" ]; then
+ ckpt_path=$2
+ shift
+ else
+ printf 'ERROR: "--ckpt" requires a path to be supplied.\n'
+ exit 1
+ fi
+ ;;
+ --train)
+ train=true
+ break;;
+ *) shift;
+ esac;
+done
+
+
+# DS-CNN Small training
+if [ "$train" = true ]
+then
+python train.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_S/retrain_logs --train_dir work/DS_CNN/DS_CNN_S/training
+fi
+
+# Conversion to TFLite fp32
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize
+
+# Conversion to TFLite int8
+python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8
+
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt
new file mode 100644
index 0000000..3448cff
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt
@@ -0,0 +1,3 @@
+numpy == 1.19.5
+tensorflow == 2.5.0
+tensorflow-model-optimization == 0.6.0
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py
new file mode 100644
index 0000000..8c488b3
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py
@@ -0,0 +1,227 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions for training simple keyword spotting models."""
+
+import argparse
+from pathlib import Path
+
+import tensorflow as tf
+import numpy as np
+
+from data_processing import data_preprocessing
+from model_core_utils import models
+
+
+def train():
+ model_settings = models.prepare_model_settings(
+ len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))),
+ FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
+ FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
+
+ # Create the model.
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True)
+
+ audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url,
+ data_dir=FLAGS.data_dir,
+ silence_percentage=FLAGS.silence_percentage,
+ unknown_percentage=FLAGS.unknown_percentage,
+ wanted_words=FLAGS.wanted_words.split(','),
+ validation_percentage=FLAGS.validation_percentage,
+ testing_percentage=FLAGS.testing_percentage,
+ model_settings=model_settings)
+
+ # We decay learning rate in a constant piecewise way to help learning.
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
+ lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list,
+ values=learning_rates_list)
+
+ # Specify the optimizer configurations.
+ optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+ model.compile(optimizer=optimizer,
+ loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+ metrics=['accuracy'])
+
+ train_data = audio_processor.get_data(audio_processor.Modes.TRAINING,
+ FLAGS.background_frequency, FLAGS.background_volume,
+ int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000))
+ train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+ val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION)
+ val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE)
+
+ # We train for a max number of iterations so need to calculate how many 'epochs' this will be.
+ training_steps_max = np.sum(training_steps_list)
+ training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval))
+
+ # Callbacks.
+ train_dir = Path(FLAGS.train_dir) / "best"
+ train_dir.mkdir(parents=True, exist_ok=True)
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")),
+ save_weights_only=True,
+ monitor='val_accuracy',
+ mode='max',
+ save_best_only=True)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir)
+
+ # Train the model.
+ model.fit(x=train_data,
+ steps_per_epoch=FLAGS.eval_step_interval,
+ epochs=training_epoch_max,
+ validation_data=val_data,
+ callbacks=[model_checkpoint_callback, tensorboard_callback])
+
+ # Test and save the model.
+ test_data = audio_processor.get_data(audio_processor.Modes.TESTING)
+ test_data = test_data.batch(FLAGS.batch_size)
+
+ test_loss, test_acc = model.evaluate(x=test_data)
+ print(f'Final test accuracy: {test_acc*100:.2f}%')
+ model.save(f'saved_model/{FLAGS.model_architecture}')
+ model.save(f'keras/{FLAGS.model_architecture}.h5')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--data_url',
+ type=str,
+ default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
+ help='Location of speech training data archive on the web.')
+ parser.add_argument(
+ '--data_dir',
+ type=str,
+ default='/tmp/speech_dataset/',
+ help="""\
+ Where to download the speech training data to.
+ """)
+ parser.add_argument(
+ '--background_volume',
+ type=float,
+ default=0.1,
+ help="""\
+ How loud the background noise should be, between 0 and 1.
+ """)
+ parser.add_argument(
+ '--background_frequency',
+ type=float,
+ default=0.8,
+ help="""\
+ How many of the training samples have background noise mixed in.
+ """)
+ parser.add_argument(
+ '--silence_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be silence.
+ """)
+ parser.add_argument(
+ '--unknown_percentage',
+ type=float,
+ default=10.0,
+ help="""\
+ How much of the training data should be unknown words.
+ """)
+ parser.add_argument(
+ '--time_shift_ms',
+ type=float,
+ default=100.0,
+ help="""\
+ Range to randomly shift the training audio by in time.
+ """)
+ parser.add_argument(
+ '--testing_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a test set.')
+ parser.add_argument(
+ '--validation_percentage',
+ type=int,
+ default=10,
+ help='What percentage of wavs to use as a validation set.')
+ parser.add_argument(
+ '--sample_rate',
+ type=int,
+ default=16000,
+ help='Expected sample rate of the wavs',)
+ parser.add_argument(
+ '--clip_duration_ms',
+ type=int,
+ default=1000,
+ help='Expected duration in milliseconds of the wavs',)
+ parser.add_argument(
+ '--window_size_ms',
+ type=float,
+ default=30.0,
+ help='How long each spectrogram timeslice is',)
+ parser.add_argument(
+ '--window_stride_ms',
+ type=float,
+ default=10.0,
+ help='How far to move in time between spectrogram timeslices',)
+ parser.add_argument(
+ '--dct_coefficient_count',
+ type=int,
+ default=40,
+ help='How many bins to use for the MFCC fingerprint',)
+ parser.add_argument(
+ '--how_many_training_steps',
+ type=str,
+ default='15000,3000',
+ help='How many training loops to run',)
+ parser.add_argument(
+ '--eval_step_interval',
+ type=int,
+ default=400,
+ help='How often to evaluate the training results.')
+ parser.add_argument(
+ '--learning_rate',
+ type=str,
+ default='0.001,0.0001',
+ help='How large a learning rate to use when training.')
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=100,
+ help='How many items to train with at once',)
+ parser.add_argument(
+ '--summaries_dir',
+ type=str,
+ default='/tmp/retrain_logs',
+ help='Where to save summary logs for TensorBoard.')
+ parser.add_argument(
+ '--wanted_words',
+ type=str,
+ default='yes,no,up,down,left,right,on,off,stop,go',
+ help='Words to use (others will be added to an unknown label)',)
+ parser.add_argument(
+ '--train_dir',
+ type=str,
+ default='/tmp/speech_commands_train',
+ help='Directory to write event logs and checkpoint.')
+ parser.add_argument(
+ '--model_architecture',
+ type=str,
+ default='dnn',
+ help='What model architecture to use')
+ parser.add_argument(
+ '--model_size_info',
+ type=int,
+ nargs="+",
+ default=[128, 128, 128],
+ help='Model dimensions - different for various models')
+
+ FLAGS, _ = parser.parse_known_args()
+ train()
diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt
new file mode 100644
index 0000000..ba41645
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md b/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md
deleted file mode 100644
index 26be0bf..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DS-CNN Small INT16
-
-## Description
-This is a fully quantized version (asymmetrical int16) of the DS-CNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | e82c7d645bec3dec580a096de0a297c6dd9a6463 |
-| Size (Bytes) | 55392 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.933 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: HERO |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT16 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probabilities for the 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint
deleted file mode 100644
index 7415b78..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint
+++ /dev/null
@@ -1,2 +0,0 @@
-model_checkpoint_path: "ds_cnn_0.939_ckpt"
-all_model_checkpoint_paths: "ds_cnn_0.939_ckpt"
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001
deleted file mode 100644
index d850952..0000000
Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001 and /dev/null differ
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index
deleted file mode 100644
index 75f70e3..0000000
Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index and /dev/null differ
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml b/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml
deleted file mode 100644
index 59c1dc7..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 93.39%
-description: 'This is a fully quantized version (asymmetrical int16) of the DS-CNN
- Small model developed by Arm, with training checkpoints, from the Hello Edge paper.
- Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 55392
- filename: ds_cnn_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: e82c7d645bec3dec580a096de0a297c6dd9a6463
- provenance: https://github.com/ARM-software/ML-examples/tree/main/tflu-kws-cortex-m
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/ds_cnn/tflite_int16/testing_input/serving_default_input:0
- shape:
- - 1
- - 490
- type: int16
- use_case: Random input for model regression.
- input_datatype: int16
- name: serving_default_input:0
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: StatefulPartitionedCall:0
- output_datatype: int16
- shape:
- - 1
- - 12
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - FULLY_CONNECTED
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite b/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite
deleted file mode 100644
index b19b478..0000000
Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite and /dev/null differ
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh b/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy b/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy
deleted file mode 100644
index 75a2851..0000000
Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy and /dev/null differ
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy b/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy
deleted file mode 100644
index b4c71a3..0000000
Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy and /dev/null differ
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md
deleted file mode 100644
index 230a02f..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# DS-CNN Small INT8
-
-## Description
-This is a fully quantized version (asymmetrical int8) of the DS-CNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
-
-## License
-[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
-
-## Related Materials
-### Class Labels
-The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`.
-
-### Model Recreation Code
-Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m.
-
-## Network Information
-| Network Information | Value |
-|---------------------|------------------|
-| Framework | TensorFlow Lite |
-| SHA-1 Hash | cf24429e86a9647b1632c382894bc68d26d34039 |
-| Size (Bytes) | 47616 |
-| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m |
-| Paper | https://arxiv.org/abs/1711.07128 |
-
-## Accuracy
-Dataset: Google Speech Commands Test Set
-
-| Metric | Value |
-|--------|-------|
-| Accuracy | 0.935 |
-
-## Performance
-| Platform | Optimized |
-|----------|:---------:|
-| Cortex-A |:heavy_check_mark: |
-| Cortex-M |:heavy_check_mark: HERO |
-| Mali GPU |:heavy_check_mark: |
-| Ethos U |:heavy_check_mark: |
-
-### Key
-* :heavy_check_mark: - Will run on this platform.
-* :heavy_multiplication_x: - Will not run on this platform.
-
-
-
-## Optimizations
-| Optimization | Value |
-|-----------------|---------|
-| Quantization | INT8 |
-
-## Network Inputs
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is processed MFCCs of shape (1, 490) |
-
-## Network Outputs
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probabilities for the 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml
deleted file mode 100644
index 5e507b4..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-benchmark:
- Google Speech Commands test set:
- Accuracy: 93.56%
-description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN
- Small model developed by Arm, with training checkpoints, from the Hello Edge paper.
- Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m'
-license:
-- Apache-2.0
-network:
- file_size_bytes: 47616
- filename: ds_cnn_s_quantized.tflite
- framework: TensorFlow Lite
- hash:
- algorithm: sha1
- value: cf24429e86a9647b1632c382894bc68d26d34039
- provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m
- quality_level: hero#CORTEX-M
-network_parameters:
- input_nodes:
- - description: The input is a processed MFCCs of shape (1, 490)
- example_input:
- path: models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input
- name: input
- shape:
- - 1
- - 490
- output_nodes:
- - description: The probability on 12 keywords.
- name: Identity
- shape:
- - 1
- - 12
- test_output_path: models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity
-operators:
- TensorFlow Lite:
- - AVERAGE_POOL_2D
- - CONV_2D
- - DEPTHWISE_CONV_2D
- - DEQUANTIZE
- - FULLY_CONNECTED
- - QUANTIZE
- - RELU
- - RESHAPE
- - SOFTMAX
-paper: https://arxiv.org/abs/1711.07128
diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh
deleted file mode 100755
index e59caf5..0000000
--- a/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env bash
-
-wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt
-mv labels.txt labelmappings.txt
\ No newline at end of file