diff --git a/README.md b/README.md index 1c3ba33..71f654e 100644 --- a/README.md +++ b/README.md @@ -101,114 +101,214 @@ Score (Accuracy) - CNN Large INT8 * + CNN Large INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.931 + 0.923 - CNN Medium INT8 * + CNN Medium INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.911 + 0.905 - CNN Small INT8 * + CNN Small INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.912 + 0.902 - DNN Large INT8 * + DNN Large INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.863 + 0.860 - DNN Medium INT8 * + DNN Medium INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.844 + 0.839 - DNN Small INT8 * + DNN Small INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.825 + 0.821 - DS-CNN Clustered FP32 * + DS-CNN Large Clustered FP32 * FP32 TensorFlow Lite :heavy_check_mark: - :heavy_multiplication_x: + :heavy_check_mark: :heavy_check_mark: :heavy_multiplication_x: - 0.950 + 0.948 - DS-CNN Clustered INT8 * + DS-CNN Large Clustered INT8 * INT8 TensorFlow Lite - :heavy_multiplication_x: :heavy_check_mark: :heavy_check_mark: :heavy_check_mark: - 0.940 + :heavy_check_mark: + 0.939 - DS-CNN Large INT8 * + DS-CNN Large INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: HERO :heavy_check_mark: :heavy_check_mark: - 0.946 + 0.945 - DS-CNN Medium INT8 * + DS-CNN Medium INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: HERO :heavy_check_mark: :heavy_check_mark: - 0.941 + 0.939 - DS-CNN Small INT8 * + DS-CNN Small INT8 * INT8 TensorFlow Lite :heavy_check_mark: :heavy_check_mark: HERO :heavy_check_mark: :heavy_check_mark: - 0.935 + 0.931 + + + DS-CNN Small INT16 * + INT16 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: HERO + :heavy_check_mark: + :heavy_check_mark: + 0.934 + + + CNN Large FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.934 + + + CNN Medium FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.918 + + + CNN Small FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.922 + + + DNN Large FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.867 + + + DNN Medium FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.850 + + + DNN Small FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: + :heavy_check_mark: + :heavy_multiplication_x: + 0.836 + + + DS-CNN Large FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: HERO + :heavy_check_mark: + :heavy_multiplication_x: + 0.950 + + + DS-CNN Medium FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: HERO + :heavy_check_mark: + :heavy_multiplication_x: + 0.943 + + + DS-CNN Small FP32 * + FP32 + TensorFlow Lite + :heavy_check_mark: + :heavy_check_mark: HERO + :heavy_check_mark: + :heavy_multiplication_x: + 0.939 MicroNet Large INT8 diff --git a/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite 
b/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite index 9c9da85..a85250a 100644 Binary files a/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite and b/models/experimental/efficientnet_lite0_224/efficientnet_lite0_224.tflite differ diff --git a/models/experimental/har_cnn/har_int8.tflite b/models/experimental/har_cnn/har_int8.tflite index 9d65d7e..a85b125 100644 Binary files a/models/experimental/har_cnn/har_int8.tflite and b/models/experimental/har_cnn/har_int8.tflite differ diff --git a/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite b/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite index f188cd2..65e2043 100644 Binary files a/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite and b/models/experimental/ssd_mobilenet_v3_int8/ssd_mobilenet_v3_int8.tflite differ diff --git a/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite b/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite index 3270fe7..5a77ec3 100644 Binary files a/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite and b/models/experimental/yolov3_416_416_backbone_mltools_int8/yolov3_416_416_backbone_mltools_int8.tflite differ diff --git a/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite b/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite index 5a45bf0..b879213 100644 Binary files a/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite and b/models/experimental/yolov3_tiny_int8_pruned_backbone_only/yolov3_tiny_int8_pruned_backbone_only.tflite differ diff --git a/models/keyword_spotting/cnn_large/model_package_tf/README.md b/models/keyword_spotting/cnn_large/model_package_tf/README.md new file mode 100644 index 0000000..b0cbfe4 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/README.md @@ -0,0 +1,115 @@ +# CNN Large model package + +This folder contains code that will allow you to recreate the CNN Large keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | CNN_Large | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. 
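For reference, the quantization that the script applies follows TensorFlow's standard post-training quantization flow, implemented in `convert_to_tflite.py` in this package. The sketch below shows the core of that flow; the Keras model path, the output file name and the random representative data are placeholders for illustration only.

```python
# Minimal sketch of the post-training quantization flow used by this package
# (see convert_to_tflite.py for the full version). Paths and the representative
# data below are placeholders, not files shipped with the package.
import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model("cnn_large.h5")  # placeholder model path

def representative_dataset():
    # In practice, yield MFCC features from the validation set; random data
    # here only keeps the sketch self-contained.
    for _ in range(100):
        yield [np.random.rand(1, *model.input_shape[1:]).astype(np.float32)]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

with open("cnn_large_quantized.tflite", "wb") as f:  # placeholder output name
    f.write(converter.convert())
```

The fp32 TFLite file is produced by the same converter call with the quantization options left at their defaults.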
+ +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model with the accuracy stated in this repository. +Note that exporting to TFLite will still happen with the pre-trained checkpoint files, so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command-line argument *--model_size_info* is used to pass the neural network layer +dimensions, such as the number of layers and convolution filter size/stride, as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture, see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). + +## Testing +To run inference on the trained model from a checkpoint and get accuracy on the validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models straightforward. + +To quantize your trained model (e.g. a DNN), run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The *inference_type* parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*.
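As an optional sanity check (assuming the file name above), you can open the generated file with the TFLite interpreter and confirm that the input and output tensors have the expected types and quantization parameters:

```python
# Minimal sketch: inspect a converted TFLite file's input/output types and
# quantization parameters. The file name is assumed from the step above.
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="dnn_quantized.tflite")
interpreter.allocate_tensors()

for detail in interpreter.get_input_details() + interpreter.get_output_details():
    scale, zero_point = detail["quantization"]
    print(detail["name"], detail["dtype"], detail["shape"],
          f"scale={scale}, zero_point={zero_point}")
```

For a fully quantized model the reported dtypes should match the `--inference_type` you passed; real inputs then have to be scaled with these parameters before inference, which is what `tflite_inference` in `cnn_l_inference_tflite.py` does.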
+You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + 
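    # Example invocation of this script (file paths below are placeholders):
    #   python cnn_l_inference_keras.py --wav some_clip.wav --labels labels.txt \
    #       --keras_file_path cnn_large.h5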
main() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/cnn_l_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + 
parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. 
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. + converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py new file mode 100644 index 0000000..1bec940 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from cnn_l_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..d818b93 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CNN_Large - Optimised\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| CNN_Large \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Optimised |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:11:37.988637: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:12:28.656297: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:12:28.695168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:12:28.695203: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:12:28.715771: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:12:28.715835: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:12:28.718556: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:12:28.718828: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:12:28.719402: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:12:28.720115: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:12:28.720266: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:12:28.720628: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:12:28.720911: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:12:28.721608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 12:12:28.721996: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:12:28.722060: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:12:29.189512: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:12:29.189552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:12:29.189560: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:12:29.190094: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:12:30.746072: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:12:31.596489: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:12:31.596713: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:12:31.597272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:12:31.597524: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:12:31.597556: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:12:31.597566: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:12:31.597575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:12:31.597851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:12:31.615526: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:12:31.619233: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.019ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.003ms.\n", + "\n", + "2023-01-31 12:12:31.702242: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:12:31.702286: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:12:31.707954: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:12:31.710595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:12:31.710946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:12:31.710984: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:12:31.710993: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:12:31.711005: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:12:31.711361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 12:12:31.770147: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 301 3 6 3 16 7 5 10 3 4 13]\n", + " [ 0 1 383 1 1 1 5 2 0 0 0 3]\n", + " [ 0 8 3 362 1 13 3 0 1 1 2 12]\n", + " [ 0 2 1 0 322 0 2 0 5 10 5 3]\n", + " [ 0 2 0 8 0 360 0 0 0 1 1 5]\n", + " [ 0 1 8 4 0 1 336 1 1 0 0 0]\n", + " [ 0 6 0 0 1 0 1 353 0 1 1 0]\n", + " [ 1 3 0 1 4 1 0 0 342 7 1 3]\n", + " [ 0 3 0 1 19 1 2 0 4 338 4 1]\n", + " [ 1 1 2 0 7 1 1 0 2 1 334 0]\n", + " [ 0 5 0 9 1 7 0 1 1 3 1 344]]\n", + "Validation accuracy = 93.27%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 350 1 5 5 8 9 5 8 0 6 11]\n", + " [ 0 9 401 0 0 1 3 0 0 1 0 4]\n", + " [ 0 2 1 375 0 8 5 0 0 0 0 14]\n", + " [ 0 8 0 2 388 2 0 0 5 13 4 3]\n", + " [ 0 4 1 8 1 378 1 0 2 0 1 10]\n", + " [ 0 5 7 1 2 0 396 0 0 0 1 0]\n", + " [ 0 11 0 0 0 1 5 377 0 0 1 1]\n", + " [ 0 5 0 0 4 4 0 0 363 14 2 4]\n", + " [ 0 4 0 2 12 0 1 0 6 374 1 2]\n", + " [ 0 0 0 0 5 5 0 0 0 1 400 0]\n", + " [ 0 4 2 13 3 13 3 1 0 3 1 359]]\n", + "Test accuracy = 93.44%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:13:11.688023: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:14:02.193138: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:14:02.228847: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB 
deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:14:02.228887: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:14:02.249127: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:14:02.249193: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:14:02.251962: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:14:02.252223: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:14:02.252782: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:14:02.253506: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:14:02.253657: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:14:02.254137: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:14:02.254437: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:14:02.255267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:14:02.255838: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:14:02.255907: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:14:02.712898: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:14:02.712937: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:14:02.712946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:14:02.713547: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:14:04.312064: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:14:05.110529: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:14:05.110622: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:14:05.111243: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:14:05.111519: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:14:05.111551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:14:05.111562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:14:05.111570: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:14:05.111865: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:14:05.131485: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:14:05.133498: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.009ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 12:14:05.210179: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:14:05.210218: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:14:05.215177: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:14:05.217453: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:14:05.217717: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:14:05.217748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:14:05.217758: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:14:05.217766: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:14:05.218054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:14:05.257830: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 304 3 4 3 14 7 6 9 4 4 13]\n", + " [ 0 2 382 2 1 0 4 2 0 1 0 3]\n", + " [ 0 7 3 356 5 11 3 0 1 1 3 16]\n", + " [ 0 2 1 0 318 1 2 0 5 10 8 3]\n", + " [ 0 2 0 8 1 354 1 0 0 0 4 7]\n", + " [ 0 2 6 3 3 1 333 2 1 0 0 1]\n", + " [ 0 7 0 0 1 0 3 349 0 2 1 0]\n", + " [ 1 4 0 2 4 1 0 0 341 6 1 3]\n", + " [ 0 3 1 1 24 1 4 0 6 328 3 2]\n", + " [ 1 3 2 0 10 3 0 0 0 1 330 0]\n", + " [ 0 5 0 8 2 8 0 1 1 3 2 342]]\n", + "Validation accuracy = 92.42%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 351 2 4 4 7 9 6 9 0 6 10]\n", + " [ 0 12 392 0 1 1 9 0 0 0 2 2]\n", + " [ 0 5 1 366 2 8 6 1 0 0 1 15]\n", + " [ 0 8 1 2 379 3 2 2 7 10 9 2]\n", + " [ 0 7 1 10 1 370 1 1 1 0 4 10]\n", + " [ 0 8 7 2 4 0 387 2 0 0 2 0]\n", + " [ 0 10 0 0 1 0 8 372 0 1 2 2]\n", + " [ 1 12 0 0 6 4 0 1 356 11 0 5]\n", + " [ 0 5 0 2 15 0 0 1 6 368 2 3]\n", + " [ 0 0 0 2 4 4 0 0 0 0 399 2]\n", + " [ 0 5 0 12 4 15 4 1 1 1 4 355]]\n", + "Test accuracy = 92.09%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. 
For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model matching the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper; other variations are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions, such as the number of layers and convolution filter size/stride, as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on the validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g.
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python cnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..fdb2fcc --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the CNN Large model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | e77e0f185dd6b7b9adcb9d867279a6c0a0ecbf02 | +| Size (Bytes) | 1908316 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 93.44% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
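+
+## Example Usage
+A minimal inference sketch (not part of the original package) showing how this fp32 TFLite file can be run with the TensorFlow Lite interpreter against the example input/output pair described in the tables below. The file paths follow those tables; the comparison tolerance is an arbitrary choice for illustration.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+BASE = "models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32"
+
+# Load the fp32 model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path=f"{BASE}/cnn_l.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Example MFCC input of shape (1, 490) and the stored reference output of shape (1, 12).
+example_in = np.load(f"{BASE}/testing_input/input/0.npy").astype(np.float32)
+reference_out = np.load(f"{BASE}/testing_output/Identity/0.npy")
+
+interpreter.set_tensor(input_details["index"], example_in)
+interpreter.invoke()
+probs = interpreter.get_tensor(output_details["index"])
+
+# Simple regression check against the example output, then report the top keyword.
+np.testing.assert_allclose(probs, reference_out, atol=1e-5)
+print("Predicted keyword index:", int(np.argmax(probs)))
+```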
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite new file mode 100644 index 0000000..cab79f2 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_l.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a82f9c75ab57bafccbe9a154454d228c9610bd66cb186a69bab4fcc9958558 +size 1908316 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..9404113 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + accuracy: 93.44% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the CNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 1908316 + filename: cnn_l.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: e77e0f185dd6b7b9adcb9d867279a6c0a0ecbf02 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..4b93b40 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0d3177ad9e25a08e300d6dab37303348cc99cda9137a0ed98bfe4ecabb4cbe2 +size 2088 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..cca051a --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84727ee69c9018fcd7295ca5646c29a982b948ce3abd7c4a9c44c7203c699b24 +size 176 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..8befb51 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the CNN Large model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | a61ab748ae8f52f78ab568342db67a792c6ecf34 | +| Size (Bytes) | 484600 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 92.27% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
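+
+## Example Usage
+A minimal inference sketch (not part of the original package) for the int8 file using the TensorFlow Lite interpreter. The example input listed in the tables below is already quantized to int8; the int8 output is dequantized with the scale and zero point reported by the interpreter. File paths follow those tables.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+BASE = "models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8"
+
+interpreter = tf.lite.Interpreter(model_path=f"{BASE}/cnn_l_quantized.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Quantized int8 MFCC input of shape (1, 490).
+example_in = np.load(f"{BASE}/testing_input/input/0.npy").astype(np.int8)
+
+interpreter.set_tensor(input_details["index"], example_in)
+interpreter.invoke()
+raw_out = interpreter.get_tensor(output_details["index"])  # int8, shape (1, 12)
+
+# Dequantize the int8 output to recover approximate class probabilities.
+scale, zero_point = output_details["quantization"]
+probs = (raw_out.astype(np.float32) - zero_point) * scale
+print("Predicted keyword index:", int(np.argmax(probs)))
+```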
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/tflite_int8/cnn_l_quantized.tflite b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/cnn_l_quantized.tflite similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/cnn_l_quantized.tflite rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/cnn_l_quantized.tflite diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..32429b1 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + accuracy: 92.27% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the CNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 484600 + filename: cnn_l_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: a61ab748ae8f52f78ab568342db67a792c6ecf34 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb new file mode 100644 index 0000000..95bf328 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4200839672e3d67af379cc06349ee6af8ab3b53c966562595b31473afc252c6d +size 28876 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb new file mode 100644 index 0000000..ff4b1b6 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d0494f8fe5b99a8b92217809d33d287f855e9281465548650037906c57912a2 +size 302218 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..d05f350 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8519182ae8e5d3dbf4762e2db5c1ac27472e95e9ef4aa0772aec6991020ffd +size 1917320 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index new file mode 100644 index 0000000..f6645fe --- /dev/null +++ 
b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/saved_model/cnn_large/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398bc377f651632cfde25ca4c1e372d04fe199868080ec162f482db3a7d8399e +size 1478 diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.data-00000-of-00001 rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.index b/models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.index similarity index 100% rename from models/keyword_spotting/cnn_large/tflite_int8/ckpt/cnn_0.94_ckpt.index rename to models/keyword_spotting/cnn_large/model_package_tf/model_archive/model_source/weights/cnn_0.94_ckpt.index diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. 
+ """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
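+            # A 1x1 convolution that mixes information across channels: the pointwise half of a
+            # depthwise separable convolution.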
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint (is_training=True because the clustered model is fine-tuned below). + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr.
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..1ea0506 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ckpt_path=model_archive/model_source/weights/cnn_0.94_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# CNN Large training +if [ "$train" = true ] +then +python train.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_L/retrain_logs --train_dir work/CNN/CNN_L/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture cnn --model_size_info 60 10 4 1 1 76 10 4 2 1 58 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/model_package_tf/train.py b/models/keyword_spotting/cnn_large/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
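+    # models.create_model builds the Keras network selected by --model_architecture;
+    # --model_size_info supplies the per-layer dimensions that models.py uses to size it.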
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/cnn_large/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/cnn_large/tflite_int8/README.md b/models/keyword_spotting/cnn_large/tflite_int8/README.md deleted file mode 100644 index 479133f..0000000 --- a/models/keyword_spotting/cnn_large/tflite_int8/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# CNN Large INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the CNN Large model developed by Arm, with training checkpoints, from the Hello 
Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | a61ab748ae8f52f78ab568342db67a792c6ecf34 | -| Size (Bytes) | 484600 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.931 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml deleted file mode 100644 index 63dcf0d..0000000 --- a/models/keyword_spotting/cnn_large/tflite_int8/definition.yaml +++ /dev/null @@ -1,43 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 93.09% -description: 'This is a fully quantized version (asymmetrical int8) of the CNN Large - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 484600 - filename: cnn_l_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: a61ab748ae8f52f78ab568342db67a792c6ecf34 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/cnn_large/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/cnn_large/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/cnn_large/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/README.md new file mode 100644 index 0000000..bb7380f --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/README.md @@ -0,0 +1,115 @@ +# CNN Medium model package + +This folder contains code that will allow you to recreate the CNN Medium keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | CNN_Medium | +|:---------------: |:--------------------------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. 
+Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script
+and this time supply the path to the new checkpoint files you want to use, for example:
+
+```bash
+bash ./recreate_model.sh --ckpt <checkpoint path>
+```
+
+
+## Training
+
+To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:
+
+```
+python train.py --model_architecture dnn --model_size_info 128 128 128
+```
+The command line argument *--model_size_info* is used to pass the neural network layer
+dimensions such as number of layers, convolution filter size/stride as a list to models.py,
+which builds the TensorFlow graph based on the provided model architecture
+and layer dimensions. For more info on *model_size_info* for each network architecture see
+[models.py](models.py).
+
+The training commands with all the hyperparameters to reproduce the models shown in the
+[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh).
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ centroid initialization method are used as the clustering hyperparameters.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path>
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models super simple.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint <checkpoint path> [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default.
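+
+As a rough sketch (not one of the scripts shipped in this package), the int8 conversion performed by `convert_to_tflite.py` follows the standard TFLite post-training quantization flow shown below. The model path, output file name, input shape and random calibration data here are placeholders; the real script calibrates with MFCCs taken from the Speech Commands validation set:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+def representative_dataset():
+    # Yield ~100 calibration samples shaped like the model input (flattened MFCCs, e.g. (1, 490)).
+    for _ in range(100):
+        yield [np.random.rand(1, 490).astype(np.float32)]
+
+model = tf.keras.models.load_model('saved_model/cnn')  # placeholder path to a trained Keras model
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = representative_dataset
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8
+converter.inference_output_type = tf.int8
+
+with open('cnn_quantized.tflite', 'wb') as f:  # placeholder output name
+    f.write(converter.convert())
+```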
If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ 
b/models/keyword_spotting/cnn_medium/model_package_tf/cnn_m_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + 
'--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. 
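+        # post_training_quantize calibrates activation ranges with the samples yielded by
+        # _rep_dataset; for --inference_type int8/int16 the model inputs and outputs are quantized too.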
+ tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. + converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
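+        # The selected noise clip is scaled by the randomly drawn volume, added to the
+        # time-shifted foreground and clipped back to the [-1, 1] range before MFCCs are computed.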
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
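+            # Shuffle the unknown-word clips and append ceil(set_size * unknown_percentage / 100)
+            # of them, mirroring how the silence samples were added above.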
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py new file mode 100644 index 0000000..e5dcf30 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from cnn_m_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..34a8579 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n",
+ "#\n",
+ "# SPDX-License-Identifier: Apache-2.0\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the License); you may\n",
+ "# not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n",
+ "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CNN_Medium - Optimised\n",
+ "\n",
+ "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n",
+ "\n",
+ "## Model-Package Overview:\n",
+ "\n",
+ "| Model \t| CNN_Medium \t|\n",
+ "|:---------------:\t|:---------------------------------------------------------------:\t|\n",
+ "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n",
+ "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n",
+ "| **Architectural Delta w.r.t. Vanilla**: | None |\n",
+ "| **Domain**: \t| Keyword spotting |\n",
+ "| **Package Quality**: \t| Optimised |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Table of contents \n",
+ "\n",
+ "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as follows, with internal navigation links so you can jump easily between sections.
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:28:00.950084: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:28:52.604010: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:28:52.642244: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:28:52.642282: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:28:52.661881: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:28:52.661959: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:28:52.664744: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:28:52.665058: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:28:52.665625: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:28:52.666342: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:28:52.666491: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:28:52.666964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:28:52.667239: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:28:52.668032: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 12:28:52.668409: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:28:52.668474: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:28:53.120304: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:28:53.120344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:28:53.120355: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:28:53.120872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:28:54.678368: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:28:55.540021: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:28:55.540187: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:28:55.540624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:28:55.540870: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:28:55.540900: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:28:55.540909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:28:55.540916: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:28:55.541191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:28:55.559442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:28:55.561433: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 12:28:55.642998: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:28:55.643041: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:28:55.647105: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:28:55.649478: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:28:55.649793: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:28:55.649827: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:28:55.649839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:28:55.649846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:28:55.650184: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10987 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 12:28:55.708536: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 307 3 8 0 15 9 4 10 5 1 9]\n", + " [ 0 2 384 0 0 2 6 1 0 0 0 2]\n", + " [ 1 5 3 368 1 12 3 0 1 0 2 10]\n", + " [ 0 1 1 2 324 0 3 0 0 12 6 1]\n", + " [ 0 3 0 12 0 352 2 1 0 1 1 5]\n", + " [ 0 5 8 1 1 0 334 2 0 1 0 0]\n", + " [ 0 3 0 1 1 1 1 352 1 2 0 1]\n", + " [ 1 7 0 0 5 0 0 0 337 9 1 3]\n", + " [ 0 7 1 0 16 0 1 0 2 342 3 1]\n", + " [ 1 2 1 0 9 2 1 0 1 2 330 1]\n", + " [ 0 5 0 11 1 6 1 0 2 3 3 340]]\n", + "Validation accuracy = 93.16%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 341 3 6 5 6 8 6 12 3 3 15]\n", + " [ 0 6 395 2 0 2 12 0 0 0 0 2]\n", + " [ 0 8 2 363 0 11 9 0 0 0 0 12]\n", + " [ 0 8 0 1 386 1 1 0 3 15 8 2]\n", + " [ 0 6 3 12 1 371 3 0 3 0 1 6]\n", + " [ 0 4 6 1 1 1 394 3 0 0 2 0]\n", + " [ 0 13 0 0 1 0 6 372 0 1 1 2]\n", + " [ 1 9 0 0 4 7 1 0 356 17 1 0]\n", + " [ 0 5 0 1 14 0 3 1 5 364 1 8]\n", + " [ 0 0 0 0 9 3 1 0 0 1 392 5]\n", + " [ 0 8 1 24 3 6 2 0 0 4 5 349]]\n", + "Test accuracy = 91.84%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:29:24.873900: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:30:17.291981: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:30:17.332661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB 
deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:30:17.332698: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:30:17.352880: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:30:17.352950: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:30:17.355747: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:30:17.356015: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:30:17.356577: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:30:17.357311: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:30:17.357465: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:30:17.357965: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:30:17.358267: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:30:17.358989: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:30:17.359555: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:30:17.359642: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:30:17.803416: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:30:17.803457: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:30:17.803465: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:30:17.803976: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:30:19.386735: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:30:20.196203: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:30:20.196287: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:30:20.196874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:30:20.197122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:30:20.197152: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:30:20.197161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:30:20.197168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:30:20.197458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:30:20.215456: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:30:20.218487: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.015ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.003ms.\n", + "\n", + "2023-01-31 12:30:20.293490: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:30:20.293531: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:30:20.297417: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:30:20.299779: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:30:20.300054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:30:20.300091: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:30:20.300104: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:30:20.300114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:30:20.300414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10960 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:30:20.327055: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 305 3 9 0 14 12 4 7 5 1 11]\n", + " [ 1 3 380 0 0 1 7 3 0 0 1 1]\n", + " [ 1 14 3 349 1 10 7 0 1 0 4 16]\n", + " [ 0 4 1 1 310 2 3 0 0 15 13 1]\n", + " [ 0 6 0 12 0 341 2 1 1 1 8 5]\n", + " [ 0 5 9 1 3 0 327 3 0 0 4 0]\n", + " [ 0 7 0 0 3 0 3 346 0 2 0 2]\n", + " [ 1 12 0 1 2 0 0 0 333 9 1 4]\n", + " [ 0 7 1 0 20 0 2 0 1 331 8 3]\n", + " [ 1 2 1 0 11 2 2 0 1 2 326 2]\n", + " [ 0 5 0 12 2 7 1 0 4 2 5 334]]\n", + "Validation accuracy = 91.18%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 1 335 5 6 6 9 9 5 12 3 2 15]\n", + " [ 0 9 390 3 0 3 10 0 0 1 0 3]\n", + " [ 0 8 2 358 3 10 8 1 0 0 2 13]\n", + " [ 0 10 0 1 380 2 5 1 2 14 9 1]\n", + " [ 0 7 3 12 2 361 3 0 3 0 4 11]\n", + " [ 0 7 7 1 1 0 391 5 0 0 0 0]\n", + " [ 0 14 0 0 2 1 6 367 0 2 1 3]\n", + " [ 2 13 0 0 6 7 2 0 349 16 1 0]\n", + " [ 0 6 0 1 13 0 5 1 4 360 3 9]\n", + " [ 0 1 0 1 8 9 1 0 0 1 382 8]\n", + " [ 0 10 0 29 3 8 3 2 0 5 6 336]]\n", + "Test accuracy = 90.33%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. 
The quantized version will use post-training quantization to fully quantize it.\n",
+ "\n",
+ "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model matching the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g.
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python cnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..37debc0 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the CNN Medium model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 0057378e784ccb8fa28abaa972a86988fbecea19 | +| Size (Bytes) | 717268 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 91.84% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs. 
| + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite new file mode 100644 index 0000000..f928da7 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_m.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d18705eebbb20d0ffa569266c97c839082f9a6cd37115c834661081832edc22c +size 717268 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..8bea635 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + accuracy: 91.84% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the CNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 717268 + filename: cnn_m.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 0057378e784ccb8fa28abaa972a86988fbecea19 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..1752993 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a935408c16cb85e8d23f9d604ea41231df1f8005c067e0a692146e7b881481 +size 2088 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..c590a95 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62786f0bb0878883ab48d4a76086aff8cea161ac537ea41615901378926052a8 +size 176 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..6318de4 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the CNN Medium model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 6bc68074d960bbb0c695e19fd96fd7903131ef60 | +| Size (Bytes) | 186064 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 90.47% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
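+
+## Example Inference
+Because this model is fully quantized, its input and output tensors are int8. The sketch below shows one way to run a single clip through the model with the TFLite interpreter; the MFCC input path is illustrative (for example the `.npy` file under `testing_input/input`), and the quantization parameters are read from the model rather than hard-coded.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Load the fully quantized model and allocate its tensors.
+interpreter = tf.lite.Interpreter(model_path='cnn_m_quantized.tflite')
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Preprocessed MFCC features of shape [1, 490] (here loaded from the example input).
+mfcc = np.load('testing_input/input/0.npy').astype(np.float32).reshape(1, 490)
+
+# Quantize the fp32 features using the scale and zero point stored in the model.
+scale, zero_point = input_details['quantization']
+int8_input = np.round(mfcc / scale + zero_point).astype(np.int8)
+
+interpreter.set_tensor(input_details['index'], int8_input)
+interpreter.invoke()
+
+# Dequantize the int8 output back to scores over the 12 output classes.
+int8_output = interpreter.get_tensor(output_details['index']).astype(np.float32)
+out_scale, out_zero_point = output_details['quantization']
+scores = (int8_output - out_zero_point) * out_scale
+print('Predicted class index:', int(np.argmax(scores)))
+```
+
+For the full wav-to-keyword flow (MFCC extraction included), see `cnn_m_inference_tflite.py` in the model package.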
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/cnn_m_quantized.tflite b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/cnn_m_quantized.tflite similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/cnn_m_quantized.tflite rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/cnn_m_quantized.tflite diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..10f79a7 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + Accuracy: 90.47% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the CNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 186064 + filename: cnn_m_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 6bc68074d960bbb0c695e19fd96fd7903131ef60 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb new file mode 100644 index 0000000..30ebf5e --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae36d2d043a0d2b71e7f5fd8eef87f627324344451706fbfa6dcdcd9fd95bd6f +size 28876 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb new file mode 100644 index 0000000..5d6fdbc --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0228b4fa8fed68d9bfbaa60e6f7157f91c6b4e142d0278b4141006749fc1ccd8 +size 302218 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..6a79c8b --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54b7d5df343e2d5285d1d64a9bfb743ace65a402e87e9d963e69b0417a59e5d +size 725888 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index new file mode 100644 index 0000000..99cba5f --- /dev/null +++ 
b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/saved_model/cnn_medium/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d12a6c029bb2ff6a692e3376a01e160f78461add8d82d1d6c53e7e65c0d5f278 +size 1476 diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.data-00000-of-00001 rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.index b/models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.index similarity index 100% rename from models/keyword_spotting/cnn_medium/tflite_int8/ckpt/cnn_0.93_ckpt.index rename to models/keyword_spotting/cnn_medium/model_package_tf/model_archive/model_source/weights/cnn_0.93_ckpt.index diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. 
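+
+ For example (illustrative values), with sample_rate=16000, clip_duration_ms=1000,
+ window_size_ms=40, window_stride_ms=20 and dct_coefficient_count=10, this gives
+ window_size_samples=640, window_stride_samples=320, spectrogram_length=49 and
+ fingerprint_size=490, which matches the (1, 490) input shape of the TFLite files in this package.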
+ """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. 
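+    # With the defaults below (--how_many_training_steps '3750,750', --learning_rate '0.001,0.0001')
+    # this yields boundaries=[3750] and values=[0.001, 0.0001]: fine-tuning runs at 0.001 for the
+    # first 3750 steps and at 0.0001 for the remaining 750.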
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..a295f58 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
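+
+# Usage (see README.md): run with no arguments to generate and evaluate the TFLite files from the
+# bundled pre-trained checkpoint, pass --train to retrain from scratch first, or pass
+# --ckpt <path> to convert a specific checkpoint.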
+ +ckpt_path=model_archive/model_source/weights/cnn_0.93_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# CNN Medium training +if [ "$train" = true ] +then +python train.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_M/retrain_logs --train_dir work/CNN/CNN_M/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture cnn --model_size_info 64 10 4 1 1 48 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/train.py b/models/keyword_spotting/cnn_medium/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
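+    # The trailing True enables training-time behaviour in the model builders
+    # (for example, the LSTM variant in models.py is built without unrolling when training).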
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/cnn_medium/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/README.md b/models/keyword_spotting/cnn_medium/tflite_int8/README.md deleted file mode 100644 index 5576d61..0000000 --- a/models/keyword_spotting/cnn_medium/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# CNN Medium INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the CNN Medium model developed by Arm, with training checkpoints, from the 
Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 6bc68074d960bbb0c695e19fd96fd7903131ef60 | -| Size (Bytes) | 186064 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.911 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml deleted file mode 100644 index a7851bb..0000000 --- a/models/keyword_spotting/cnn_medium/tflite_int8/definition.yaml +++ /dev/null @@ -1,43 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 91.08% -description: 'This is a fully quantized version (asymmetrical int8) of the CNN Medium - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 186064 - filename: cnn_m_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 6bc68074d960bbb0c695e19fd96fd7903131ef60 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/cnn_medium/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/cnn_medium/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/cnn_medium/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/model_package_tf/README.md b/models/keyword_spotting/cnn_small/model_package_tf/README.md new file mode 100644 index 0000000..b74f3ba --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/README.md @@ -0,0 +1,115 @@ +# CNN Small model package + +This folder contains code that will allow you to recreate the CNN Small keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | CNN_Small | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. 
+Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). + +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. 
If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/cnn_s_inference_tflite.py 
@@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice 
is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. 
+ + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. + converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
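+    # For example, 'bobby_nohash_0.wav' and 'bobby_nohash_1.wav' both reduce to 'bobby' here,
+    # so close variations of the same recording always land in the same partition.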
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
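+        # The result is clip(foreground + background_volume * background, -1.0, 1.0);
+        # background_volume is zero for clips not selected for augmentation, so those pass
+        # through unchanged.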
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py new file mode 100644 index 0000000..026e8f8 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from cnn_s_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..8b19ae4 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CNN_Small - Optimised\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| CNN_Small \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Optimised |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:13:21.365383: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:14:12.415896: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:14:12.453662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:14:12.453701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:14:12.477025: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:14:12.477130: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:14:12.480970: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:14:12.481614: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:14:12.482232: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:14:12.483034: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:14:12.483190: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:14:12.483677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:14:12.483964: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:14:12.484760: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 13:14:12.485262: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:14:12.485316: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:14:12.916344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:14:12.916381: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:14:12.916389: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:14:12.916905: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:14:14.471348: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:14:15.329325: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:14:15.329556: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:14:15.329983: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:14:15.330272: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:14:15.330306: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:14:15.330322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:14:15.330334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:14:15.330642: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:14:15.347491: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:14:15.352470: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.021ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 13:14:15.425956: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:14:15.425996: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:14:15.429502: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:14:15.431843: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:14:15.432118: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:14:15.432154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:14:15.432167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:14:15.432178: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:14:15.432489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 13:14:15.484981: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 300 5 7 2 13 9 4 11 4 2 14]\n", + " [ 0 1 381 4 0 2 8 0 0 0 0 1]\n", + " [ 1 13 1 363 0 8 3 1 0 1 4 11]\n", + " [ 0 3 1 1 328 0 1 0 5 8 3 0]\n", + " [ 0 9 0 12 1 340 5 0 0 0 4 6]\n", + " [ 1 3 9 2 2 0 332 2 0 0 0 1]\n", + " [ 0 11 0 0 1 2 6 341 0 1 0 1]\n", + " [ 1 9 0 0 4 1 0 0 339 8 1 0]\n", + " [ 0 3 2 0 20 0 4 0 4 334 3 3]\n", + " [ 1 5 1 0 9 1 2 0 0 2 329 0]\n", + " [ 0 9 0 28 1 8 1 0 0 5 6 314]]\n", + "Validation accuracy = 91.61%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 338 5 9 3 1 6 10 17 4 6 9]\n", + " [ 0 10 395 2 0 2 9 0 0 0 0 1]\n", + " [ 0 7 3 374 0 10 5 0 0 0 0 6]\n", + " [ 0 8 0 0 395 2 0 0 5 7 6 2]\n", + " [ 0 9 2 14 1 369 0 1 3 0 2 5]\n", + " [ 0 6 7 0 1 0 394 2 0 1 1 0]\n", + " [ 0 4 0 0 0 2 8 378 1 1 0 2]\n", + " [ 1 13 0 0 5 3 1 0 356 14 1 2]\n", + " [ 0 2 0 1 11 0 1 0 7 372 0 8]\n", + " [ 0 1 0 0 5 4 2 0 0 0 394 5]\n", + " [ 0 15 0 28 4 10 2 2 1 2 2 336]]\n", + "Test accuracy = 92.21%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:14:39.184982: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:15:30.798819: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:15:30.834958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB 
deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:15:30.834997: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:15:30.856434: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:15:30.856508: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:15:30.860012: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:15:30.860406: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:15:30.861063: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:15:30.861848: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:15:30.862001: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:15:30.862359: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:15:30.862643: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:15:30.863248: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:15:30.863639: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:15:30.863701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:15:31.316265: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:15:31.316302: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:15:31.316312: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:15:31.316827: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:15:32.911559: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:15:33.701396: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:15:33.701483: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:15:33.702020: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:15:33.702305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:15:33.702342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:15:33.702357: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:15:33.702364: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:15:33.702677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:15:33.719401: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:15:33.721665: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 13:15:33.790485: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:15:33.790521: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:15:33.793705: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:15:33.795921: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:15:33.796178: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:15:33.796208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:15:33.796218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:15:33.796225: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:15:33.796508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10809 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:15:33.820120: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 301 5 8 2 12 9 4 11 3 3 13]\n", + " [ 0 2 376 3 0 2 11 2 0 0 0 1]\n", + " [ 1 13 1 350 4 8 5 1 1 0 4 18]\n", + " [ 0 4 1 1 321 0 2 0 5 7 9 0]\n", + " [ 0 10 0 9 3 337 4 1 0 0 7 6]\n", + " [ 1 6 9 1 4 1 327 2 0 0 0 1]\n", + " [ 0 14 0 0 2 2 6 337 1 1 0 0]\n", + " [ 1 9 1 0 4 2 1 0 339 5 1 0]\n", + " [ 0 4 1 0 25 0 5 0 6 322 5 5]\n", + " [ 1 6 1 0 13 1 1 0 1 3 323 0]\n", + " [ 0 11 1 26 3 7 1 1 1 3 4 314]]\n", + "Validation accuracy = 90.39%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 332 3 8 7 3 11 6 15 2 10 11]\n", + " [ 0 9 390 1 3 2 14 0 0 0 0 0]\n", + " [ 0 8 2 355 5 12 8 0 0 0 1 14]\n", + " [ 0 12 0 0 386 2 1 0 5 7 11 1]\n", + " [ 0 12 2 11 2 363 0 1 4 1 6 4]\n", + " [ 0 5 7 0 8 0 388 3 0 1 0 0]\n", + " [ 0 5 0 0 4 0 15 369 0 1 0 2]\n", + " [ 1 14 0 0 6 3 1 1 352 14 2 2]\n", + " [ 0 4 0 1 16 0 4 0 16 352 2 7]\n", + " [ 0 1 0 0 10 3 2 1 1 0 388 5]\n", + " [ 0 14 1 28 10 14 3 4 0 1 2 325]]\n", + "Test accuracy = 90.14%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. 
The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python cnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..c964371 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the CNN Small model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | e9471348e6fb25191092236dac6af7c1fc84116b | +| Size (Bytes) | 280444 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 92.21% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
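+
+## Example Inference
+A minimal sketch of single-clip inference with the TensorFlow Lite Python interpreter, using the example input shipped in `testing_input`. Paths assume you run from the repository root; the packaged `cnn_s_inference_tflite.py` script remains the reference flow. See the Network Inputs and Network Outputs tables below for the expected shapes.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+MODEL = "models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite"
+SAMPLE = "models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy"
+
+interpreter = tf.lite.Interpreter(model_path=MODEL)
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# The input is a flattened MFCC feature vector of shape (1, 490), fp32.
+mfcc = np.load(SAMPLE).astype(np.float32).reshape(inp["shape"])
+interpreter.set_tensor(inp["index"], mfcc)
+interpreter.invoke()
+
+# The output is a (1, 12) vector of probabilities over the 12 keyword classes.
+probs = interpreter.get_tensor(out["index"])
+print("Predicted class index:", int(np.argmax(probs)))
+```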
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite new file mode 100644 index 0000000..11ed7c3 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/cnn_s.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39d968b59dec6a543fba800718fd72c9009644b39bcfd1e08226e18b40b6d9b5 +size 280444 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..18e9f60 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + accuracy: 92.21% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the CNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 280444 + filename: cnn_s.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: e9471348e6fb25191092236dac6af7c1fc84116b + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..2759db6 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e38dbf192916f7af5440e17d27eaf1a19e13054977fed1ec5e85322e3da897 +size 2088 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..b651412 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad3e4972e18774433a093b7228742fe66dceece314ea2de02bc0ac29a632cf8 +size 176 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..30ae15d --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the CNN Small model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f | +| Size (Bytes) | 75400 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 90.18% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
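+
+## Example Inference
+A minimal sketch of single-clip inference with the TensorFlow Lite Python interpreter, assuming the example `.npy` in `testing_input` holds fp32 MFCC features that must be quantized to int8 with the model's input quantization parameters. Paths assume you run from the repository root; the packaged `cnn_s_inference_tflite.py` script remains the reference flow.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+MODEL = "models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/cnn_s_quantized.tflite"
+SAMPLE = "models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy"
+
+interpreter = tf.lite.Interpreter(model_path=MODEL)
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Quantize the fp32 MFCC features to int8 using the input tensor's scale and zero point.
+scale, zero_point = inp["quantization"]
+mfcc = np.load(SAMPLE).astype(np.float32).reshape(inp["shape"])
+mfcc_q = np.clip(np.round(mfcc / scale + zero_point), -128, 127).astype(np.int8)
+
+interpreter.set_tensor(inp["index"], mfcc_q)
+interpreter.invoke()
+
+# Dequantize the (1, 12) int8 output back to approximate probabilities.
+out_scale, out_zero_point = out["quantization"]
+probs = (interpreter.get_tensor(out["index"]).astype(np.float32) - out_zero_point) * out_scale
+print("Predicted class index:", int(np.argmax(probs)))
+```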
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/cnn_s_quantized.tflite similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/cnn_s_quantized.tflite rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/cnn_s_quantized.tflite diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..c836274 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,64 @@ +benchmark: + benchmark_metrics: + Accuracy: 90.18% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the CNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 75400 + filename: cnn_s_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb new file mode 100644 index 0000000..f463c39 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97d0b45b0027a13e5c2d0a0049775bfa1ac4661ee6e1e9c20690137ba0b91539 +size 28876 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb new file mode 100644 index 0000000..1904687 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d3d2c96b473b7cd1b9ca9cd60695a3c6e27d6cc57469b79da75e709e869ff6 +size 302218 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..ad5b44d --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da9dce03052ef2895fbd3b41f28aade4d53d3ba38a706ded903c133b4c57a549 +size 288200 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index new file mode 100644 index 0000000..c4f021a --- /dev/null +++ 
b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/saved_model/cnn_small/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3021889ecbad08fd6d5edf947596f2fd9dee8a594a63a1f3d2f4bafee7271cce +size 1466 diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.data-00000-of-00001 b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.data-00000-of-00001 rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.index b/models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.index similarity index 100% rename from models/keyword_spotting/cnn_small/tflite_int8/ckpt/cnn_0.92_ckpt.index rename to models/keyword_spotting/cnn_small/model_package_tf/model_archive/model_source/weights/cnn_0.92_ckpt.index diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. 
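+
+    Example:
+        With the settings used by this package's recreate_model.sh
+        (sample_rate=16000, clip_duration_ms=1000, window_size_ms=40,
+        window_stride_ms=20, dct_coefficient_count=10) this returns
+        desired_samples=16000, window_size_samples=640, window_stride_samples=320,
+        spectrogram_length=1 + (16000 - 640) // 320 = 49 and
+        fingerprint_size=10 * 49 = 490, which matches the model's (1, 490) input.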
+ """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
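+            # Together with the depthwise convolution above, this 1x1 (pointwise)
+            # convolution completes one depthwise separable convolution block.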
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. 
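+    # e.g. with the defaults below (--how_many_training_steps 3750,750 and
+    # --learning_rate 0.001,0.0001) fine-tuning runs at 0.001 for the first 3750
+    # steps and at 0.0001 for the remaining 750 steps.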
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..1f0289a --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ckpt_path=model_archive/model_source/weights/cnn_0.92_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# CNN Small training +if [ "$train" = true ] +then +python train.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/CNN/CNN_S/retrain_logs --train_dir work/CNN/CNN_S/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture cnn --model_size_info 28 10 4 1 1 30 10 4 2 1 16 128 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/model_package_tf/train.py b/models/keyword_spotting/cnn_small/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
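+    # The final argument is is_training=True, so architectures that differ between
+    # training and inference (e.g. the basic LSTM, which is only unrolled at
+    # inference time) are built in their training configuration.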
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/cnn_small/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/cnn_small/tflite_int8/README.md b/models/keyword_spotting/cnn_small/tflite_int8/README.md deleted file mode 100644 index 54e42bd..0000000 --- a/models/keyword_spotting/cnn_small/tflite_int8/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# CNN Small INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the CNN Small model developed by Arm, with training checkpoints, from the Hello 
Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f | -| Size (Bytes) | 75400 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.912 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml deleted file mode 100644 index e5cd3c4..0000000 --- a/models/keyword_spotting/cnn_small/tflite_int8/definition.yaml +++ /dev/null @@ -1,43 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 91.23% -description: 'This is a fully quantized version (asymmetrical int8) of the CNN Small - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 75400 - filename: cnn_s_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 3415f88dfb8f78fe47d282d68ccbc3ce71a7510f - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/cnn_small/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/cnn_small/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/cnn_small/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/model_package_tf/README.md b/models/keyword_spotting/dnn_large/model_package_tf/README.md new file mode 100644 index 0000000..75d5348 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DNN Large model package + +This folder contains code that will allow you to recreate the DNN Large keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DNN_Large | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. 
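+
+If you do retrain, the best checkpoint (selected on validation accuracy) is written by `train.py` to `<train_dir>/best/` with a name of the form `<model_architecture>_<val_accuracy>_ckpt`, producing the usual TensorFlow checkpoint pair, for example (placeholder values; this assumes the package uses the same `train.py` as the other keyword spotting packages in this change):
+
+```
+<train_dir>/best/dnn_<val_accuracy>_ckpt.index
+<train_dir>/best/dnn_<val_accuracy>_ckpt.data-00000-of-00001
+```
+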
+Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). + +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster initialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default.
If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. 
+ + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. + converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
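+        # The noise clip is scaled by the randomly chosen volume (zero when no noise is
+        # being mixed in), added to the time-shifted foreground, and the sum is clipped
+        # back to the [-1, 1] range before the MFCC features are computed.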
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
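+            # The unknown-word files for this partition are shuffled (deterministically,
+            # since RANDOM_SEED was set above) and a slice equal to unknown_percentage of
+            # the original partition size is appended; these clips are later mapped to
+            # UNKNOWN_WORD_INDEX through word_to_index.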
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/dnn_l_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + 
'--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py new file mode 100644 index 0000000..5e60134 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from dnn_l_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
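+    # The test partition is evaluated in the same way as the validation partition: each
+    # MFCC is run through the TFLite interpreter one clip at a time, and the confusion
+    # matrix is printed with true labels as rows and predicted labels as columns.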
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..67b2031 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DNN_Large - Optimised\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DNN_Large \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Optimised |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:18:57.429502: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:19:44.590405: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:19:44.627169: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:19:44.627205: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:19:44.650614: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:19:44.650690: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:19:44.653550: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:19:44.653884: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:19:44.654515: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:19:44.655280: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:19:44.655466: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:19:44.655866: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:19:44.656166: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:19:44.657031: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 13:19:44.657463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:19:44.657531: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:19:45.095453: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:19:45.095490: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:19:45.095499: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:19:45.096006: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:19:46.231729: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:19:46.494512: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:19:46.494713: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:19:46.495116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:19:46.495381: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:19:46.495413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:19:46.495422: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:19:46.495429: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:19:46.495705: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:19:46.519581: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:19:46.520288: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.007ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 13:19:46.560745: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:19:46.560780: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:19:46.564917: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:19:46.566851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:19:46.567112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:19:46.567143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:19:46.567154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:19:46.567161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:19:46.567471: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to dnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 13:19:46.612300: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 265 7 9 5 21 10 10 15 6 5 18]\n", + " [ 0 6 353 5 1 2 19 6 0 0 0 5]\n", + " [ 0 11 7 340 5 14 3 1 3 1 5 16]\n", + " [ 0 4 0 3 296 1 8 1 5 20 8 4]\n", + " [ 0 3 0 17 0 334 0 1 6 1 1 14]\n", + " [ 0 5 23 1 3 1 307 8 0 2 1 1]\n", + " [ 0 10 1 2 2 2 3 339 1 2 0 1]\n", + " [ 1 9 1 2 7 7 1 0 323 9 0 3]\n", + " [ 0 3 0 1 28 2 3 1 9 323 3 0]\n", + " [ 1 4 0 0 10 2 1 0 4 3 324 1]\n", + " [ 0 11 1 34 5 17 1 1 3 3 1 295]]\n", + "Validation accuracy = 87.06%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 308 9 12 7 10 9 6 19 4 9 15]\n", + " [ 0 9 382 2 0 7 14 2 0 0 1 2]\n", + " [ 0 9 3 332 0 25 2 0 0 0 2 32]\n", + " [ 0 11 1 2 366 4 3 0 11 9 13 5]\n", + " [ 0 13 1 27 2 337 8 1 1 0 1 15]\n", + " [ 0 9 13 5 4 2 365 9 1 1 2 1]\n", + " [ 0 16 0 1 3 2 5 362 2 4 0 1]\n", + " [ 0 9 1 0 2 9 1 0 351 21 1 1]\n", + " [ 0 10 0 0 17 1 5 2 11 350 1 5]\n", + " [ 0 3 1 4 15 4 0 1 0 2 377 4]\n", + " [ 0 12 3 55 6 9 4 2 3 5 4 299]]\n", + "Test accuracy = 86.65%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:19:59.827495: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:20:49.624250: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:20:49.663343: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 
11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:20:49.663382: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:20:49.683862: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:20:49.683941: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:20:49.686764: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:20:49.687075: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:20:49.687678: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:20:49.688414: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:20:49.688571: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:20:49.688929: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:20:49.689226: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:20:49.689923: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:20:49.690297: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:20:49.690365: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:20:50.138334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:20:50.138374: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:20:50.138386: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:20:50.138892: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:20:51.250414: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:20:51.521477: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:20:51.521575: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:20:51.522122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:20:51.522382: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:20:51.522413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:20:51.522424: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:20:51.522432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:20:51.522720: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:20:51.539458: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:20:51.540454: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.01ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 13:20:51.584213: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:20:51.584254: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:20:51.588197: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:20:51.590131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:20:51.590402: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:20:51.590432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:20:51.590442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:20:51.590450: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:20:51.590759: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10942 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:20:51.621299: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to dnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 271 7 7 5 19 9 10 16 6 5 16]\n", + " [ 0 8 354 6 2 2 17 4 0 0 1 3]\n", + " [ 0 16 7 333 9 14 5 2 2 1 3 14]\n", + " [ 0 6 1 3 293 2 8 1 4 19 8 5]\n", + " [ 0 9 2 19 9 320 0 1 3 1 3 10]\n", + " [ 0 3 29 1 5 1 297 11 0 2 1 2]\n", + " [ 0 14 1 4 8 1 4 325 1 2 2 1]\n", + " [ 1 10 2 1 10 4 1 1 323 7 0 3]\n", + " [ 0 4 0 0 32 2 3 0 6 320 4 2]\n", + " [ 1 7 0 1 16 3 0 3 3 2 314 0]\n", + " [ 0 11 1 47 9 18 1 1 3 3 1 277]]\n", + "Validation accuracy = 85.44%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 319 8 13 7 8 9 5 17 4 7 11]\n", + " [ 0 10 379 2 5 7 10 2 0 0 1 3]\n", + " [ 0 13 3 332 4 20 2 2 0 2 1 26]\n", + " [ 0 15 0 3 363 3 3 3 10 11 10 4]\n", + " [ 0 17 6 26 5 327 6 2 1 0 2 14]\n", + " [ 0 13 15 3 11 2 352 10 2 3 1 0]\n", + " [ 0 18 2 0 7 2 9 350 3 2 1 2]\n", + " [ 0 13 1 0 9 7 0 1 342 17 4 2]\n", + " [ 0 14 1 0 27 0 6 3 8 334 2 7]\n", + " [ 0 5 1 2 23 3 4 1 0 1 366 5]\n", + " [ 0 13 2 62 13 13 4 2 1 3 6 283]]\n", + "Test accuracy = 84.97%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. 
The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python dnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..6d3f666 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DNN Large model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 1ce04d01ed7decc016076a868f22858d8f092942 | +| Size (Bytes) | 1985048 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 86.65% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
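+
+## Example Usage
+The snippet below is a minimal, illustrative sketch of running this fp32 model with the TensorFlow Lite Python interpreter. The model path and the randomly generated input are placeholders; in a real deployment the (1, 250) input would be the flattened MFCC features described under Network Inputs below.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Path to the fp32 TFLite file in this folder (adjust to your checkout location).
+interpreter = tf.lite.Interpreter(model_path="dnn_l.tflite")
+interpreter.allocate_tensors()
+
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+# Random stand-in for the (1, 250) flattened MFCC input.
+mfcc = np.random.rand(1, 250).astype(np.float32)
+
+interpreter.set_tensor(input_details[0]["index"], mfcc)
+interpreter.invoke()
+
+# (1, 12) scores over the 12 keyword classes; argmax gives the predicted class index.
+scores = interpreter.get_tensor(output_details[0]["index"])
+print(scores.argmax(axis=1))
+```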
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 250) | fp32 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..38082c2 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,62 @@ +benchmark: + benchmark_metrics: + accuracy: 86.65% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the DNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 1985048 + filename: dnn_l.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 1ce04d01ed7decc016076a868f22858d8f092942 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 250 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 250 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite new file mode 100644 index 0000000..e5cbfe0 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_l.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dffdcf515fce70988132d98f8007564e0b303d0b463c422f039e2074cb29fc51 +size 1985048 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..5c996be --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77bdd1510d16c990db4276179453648d51e6526f4fbbe29091c183316184c827 +size 1128 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..98bc3fd --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc29017fbb5d27101b9b96399c1fbc857a07871d759ca39a20de0b39ecc0396 +size 176 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..db3aa64 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DNN Large model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 2b1ee34e4c87ba6f24092c7457593227099efaf1 | +| Size (Bytes) | 502272 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 86.01% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. 
+* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 250) | int8 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..7040a89 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,62 @@ +benchmark: + benchmark_metrics: + accuracy: 86.01% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 502272 + filename: dnn_l_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 2b1ee34e4c87ba6f24092c7457593227099efaf1 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 250) + example_input: + path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 250 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 250 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/tflite_int8/dnn_l_quantized.tflite b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/dnn_l_quantized.tflite similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/dnn_l_quantized.tflite rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/dnn_l_quantized.tflite diff --git a/models/keyword_spotting/dnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb new file mode 100644 index 0000000..364939d --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b16edf52376997d110f024184261ef588cd3309d8175c8769aa45482cd0164 +size 10087 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb new file mode 100644 index 0000000..59d2022 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb29f21ca89c9369eca08f583daaf3c7e64cd26ab5fec4cb0b95cf9d04435ef +size 85126 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..4d554fc --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f3d672a01c44c0e86a5f485ddfe4b2e5c8c6770563a6e0520297ed1e029579 +size 1985615 diff --git 
a/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index new file mode 100644 index 0000000..fc9e90c --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/saved_model/dnn_large/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ae6b295e9da819138459f3684755e71c2fac683da141510581996541e509e6 +size 642 diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.data-00000-of-00001 rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.index b/models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.index similarity index 100% rename from models/keyword_spotting/dnn_large/tflite_int8/ckpt/dnn_0.87_ckpt.index rename to models/keyword_spotting/dnn_large/model_package_tf/model_archive/model_source/weights/dnn_0.87_ckpt.index diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. 
+ window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. 
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..cb54318 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ckpt_path=model_archive/model_source/weights/dnn_0.87_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DNN Large training +if [ "$train" = true ] +then +python train.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_L/retrain_logs --train_dir work/DNN/DNN_L/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture dnn --model_size_info 436 436 436 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/model_package_tf/train.py b/models/keyword_spotting/dnn_large/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
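+ # For example, the defaults --how_many_training_steps 15000,3000 and --learning_rate 0.001,0.0001
+ # train for 15000 steps at 1e-3 followed by 3000 steps at 1e-4.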
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/dnn_large/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/dnn_large/tflite_int8/README.md b/models/keyword_spotting/dnn_large/tflite_int8/README.md deleted file mode 100644 index 40a0507..0000000 --- a/models/keyword_spotting/dnn_large/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DNN Large INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DNN Large model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. 
- -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 2b1ee34e4c87ba6f24092c7457593227099efaf1 | -| Size (Bytes) | 502272 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.863 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml deleted file mode 100644 index 68c8968..0000000 --- a/models/keyword_spotting/dnn_large/tflite_int8/definition.yaml +++ /dev/null @@ -1,41 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 86.26% -description: 'This is a fully quantized version (asymmetrical int8) of the DNN Large - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 502272 - filename: dnn_l_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 2b1ee34e4c87ba6f24092c7457593227099efaf1 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 250) - example_input: - path: models/keyword_spotting/dnn_large/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 250 - output_nodes: - - description: The probability on 12 keywords. - name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/dnn_large/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/dnn_large/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/README.md new file mode 100644 index 0000000..8005a3c --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DNN Medium model package + +This folder contains code that will allow you to recreate the DNN Medium keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DNN_Medium | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. +Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). 
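+
+For the DNN architecture, *--model_size_info* is simply the list of hidden-layer widths. The sketch below is a simplified paraphrase of `create_dnn_model` in `model_core_utils/models.py` (illustration only; the training script builds the actual graph):
+
+```python
+import tensorflow as tf
+
+def build_dnn(fingerprint_size, label_count, model_size_info):
+    """model_size_info, e.g. [128, 128, 128], gives the width of each hidden layer."""
+    inputs = tf.keras.Input(shape=(fingerprint_size,), name='input')
+    x = inputs
+    for units in model_size_info:
+        x = tf.keras.layers.Dense(units=units, activation='relu')(x)
+    # Softmax over the output classes (wanted words plus silence and unknown).
+    output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x)
+    return tf.keras.Model(inputs, output)
+```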
+ +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster initialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. 
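+ # The converter runs this generator over sample inputs to calibrate activation ranges
+ # before fixing the quantization scales and zero points.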
+ converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
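+        # The selected noise clip is scaled by background_volume (zero when no noise is being mixed in),
+        # added to the time-shifted foreground, and clipped back to the valid [-1.0, 1.0] range before
+        # MFCC features are computed.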
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
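+            # The shuffle below is deterministic (random is seeded with RANDOM_SEED above), and the number
+            # of unknown-word clips appended is a fixed percentage of the partition size, mirroring the
+            # silence samples added just before.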
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/dnn_m_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + 
'--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py new file mode 100644 index 0000000..4481dcd --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from dnn_m_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..ac8b78c --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DNN_Medium - Optimised\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DNN_Medium \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Optimised |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:21:58.189962: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:22:48.489206: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:22:48.528844: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:22:48.528880: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:22:48.548795: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:22:48.548866: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:22:48.551645: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:22:48.551935: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:22:48.552501: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:22:48.553238: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:22:48.553392: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:22:48.553886: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:22:48.554176: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:22:48.554998: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 13:22:48.555410: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:22:48.555527: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:22:48.994481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:22:48.994520: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:22:48.994528: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:22:48.995028: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:22:50.146418: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:22:50.411740: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:22:50.411969: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:22:50.412348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:22:50.412596: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:22:50.412627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:22:50.412636: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:22:50.412643: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:22:50.412919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:22:50.431567: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:22:50.433318: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.017ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.003ms.\n", + "\n", + "2023-01-31 13:22:50.470457: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:22:50.470496: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:22:50.473049: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:22:50.475051: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:22:50.475342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:22:50.475376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:22:50.475387: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:22:50.475395: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:22:50.475693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to dnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 13:22:50.520336: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 265 9 7 5 18 11 12 17 5 8 14]\n", + " [ 0 6 346 9 0 2 22 5 1 0 1 5]\n", + " [ 0 9 8 323 8 14 3 5 0 1 2 33]\n", + " [ 0 4 0 2 304 1 3 3 4 17 9 3]\n", + " [ 0 8 1 19 1 326 2 1 7 0 0 12]\n", + " [ 0 2 24 2 3 1 304 13 0 0 0 3]\n", + " [ 0 10 1 1 4 1 4 336 1 2 0 3]\n", + " [ 1 10 1 1 7 2 0 2 326 9 1 3]\n", + " [ 1 2 0 1 27 0 1 1 11 321 4 4]\n", + " [ 2 5 0 0 16 2 2 1 1 2 318 1]\n", + " [ 0 13 0 43 6 13 1 2 3 3 1 287]]\n", + "Validation accuracy = 86.10%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 295 7 11 6 6 13 12 24 8 5 21]\n", + " [ 0 12 380 3 0 4 15 1 0 0 0 4]\n", + " [ 1 11 2 332 0 22 1 0 0 0 0 36]\n", + " [ 0 14 1 2 357 2 2 5 12 11 11 8]\n", + " [ 0 18 5 18 6 329 5 1 4 0 2 18]\n", + " [ 0 10 25 3 4 1 347 15 1 0 2 4]\n", + " [ 0 20 1 0 5 1 14 349 1 5 0 0]\n", + " [ 0 12 0 1 5 9 0 0 347 16 2 4]\n", + " [ 0 15 0 1 15 1 5 2 12 339 3 9]\n", + " [ 0 5 0 3 21 2 4 1 2 1 368 4]\n", + " [ 0 10 1 62 8 13 3 1 0 0 1 303]]\n", + "Test accuracy = 84.95%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:23:02.712653: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:23:53.488800: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:23:53.524175: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 
deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:23:53.524209: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:23:53.544183: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:23:53.544253: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:23:53.546889: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:23:53.547146: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:23:53.547744: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:23:53.548454: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:23:53.548596: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:23:53.548947: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:23:53.549238: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:23:53.549958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:23:53.550439: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:23:53.550510: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:23:53.960933: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:23:53.960972: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:23:53.960979: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:23:53.961483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:23:55.053376: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:23:55.321894: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:23:55.322084: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:23:55.322539: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:23:55.322808: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:23:55.322839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:23:55.322850: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:23:55.322858: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:23:55.323143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:23:55.347442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:23:55.348486: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 13:23:55.387556: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:23:55.387602: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:23:55.390277: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:23:55.392318: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:23:55.392627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:23:55.392665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:23:55.392681: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:23:55.392693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:23:55.393015: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10940 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:23:55.414179: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to dnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 272 6 8 8 19 9 12 17 6 4 10]\n", + " [ 0 11 341 9 5 2 20 6 0 0 0 3]\n", + " [ 0 15 9 319 13 13 2 4 1 1 3 26]\n", + " [ 0 6 0 3 307 1 1 2 3 16 9 2]\n", + " [ 0 11 1 20 12 312 3 0 6 0 1 11]\n", + " [ 0 7 26 3 5 1 294 11 1 1 1 2]\n", + " [ 0 13 1 1 9 2 5 326 1 1 2 2]\n", + " [ 2 13 0 0 7 4 1 2 318 10 4 2]\n", + " [ 1 4 0 2 37 0 1 2 12 308 3 3]\n", + " [ 2 5 0 0 21 2 2 1 1 3 312 1]\n", + " [ 0 16 1 43 9 15 1 3 1 3 1 279]]\n", + "Validation accuracy = 84.57%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 303 7 13 6 4 12 9 22 8 6 18]\n", + " [ 0 13 370 5 4 3 15 1 1 0 2 5]\n", + " [ 0 12 6 335 4 19 1 1 1 0 0 26]\n", + " [ 0 14 1 4 354 1 0 3 15 14 11 8]\n", + " [ 0 26 5 26 10 316 5 2 3 0 1 12]\n", + " [ 0 15 25 2 9 1 334 17 1 0 2 6]\n", + " [ 0 19 1 0 10 1 14 338 4 4 4 1]\n", + " [ 0 16 1 2 8 8 1 0 339 11 6 4]\n", + " [ 0 15 0 1 27 0 6 2 12 329 3 7]\n", + " [ 0 9 0 3 22 2 4 1 2 2 360 6]\n", + " [ 0 20 0 63 16 12 1 3 1 1 6 279]]\n", + "Test accuracy = 83.13%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. 
The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python dnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..54631cd --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DNN Medium model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 3c20c6ee24ee41ed6db968ff58d69f5823c94036 | +| Size (Bytes) | 797768 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 84.95% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
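+
+## Example Usage
+As a rough illustration (this snippet is not part of the packaged scripts), the model can be exercised directly with the TensorFlow Lite Python interpreter, using the example tensors shipped under `testing_input`/`testing_output`. The exact node names and shapes are listed in the Network Inputs and Network Outputs tables below; the paths assume the command is run from the repository root with TensorFlow installed.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+base = "models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32"
+
+# Load the fp32 TFLite model and allocate its tensors.
+interpreter = tf.lite.Interpreter(model_path=f"{base}/dnn_m.tflite")
+interpreter.allocate_tensors()
+input_detail = interpreter.get_input_details()[0]
+output_detail = interpreter.get_output_details()[0]
+
+# The example input is a pre-computed MFCC feature vector of shape (1, 250).
+mfcc = np.load(f"{base}/testing_input/input/0.npy").astype(np.float32)
+interpreter.set_tensor(input_detail["index"], mfcc)
+interpreter.invoke()
+
+# The output is a (1, 12) vector of probabilities over the 12 keyword classes.
+probs = interpreter.get_tensor(output_detail["index"])
+print("Predicted class index:", int(np.argmax(probs)))
+```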
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 250) | fp32 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..a650fd3 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,62 @@ +benchmark: + benchmark_metrics: + accuracy: 84.95% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the DNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 797768 + filename: dnn_m.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 3c20c6ee24ee41ed6db968ff58d69f5823c94036 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 250 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 250 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite new file mode 100644 index 0000000..e4e30d7 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_m.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8380c3ff3a3152c5ab5cc2a226c73707924d906e468f708513ffa84d6e9a1d96 +size 797768 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..85f3e34 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f811913684442a9517879b173e29799094e4261cbef84c0a84536564179349 +size 1128 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..6af5cd7 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed63e3ed1b354a3927bf735223654a482c6745299f5e2a57ed3974dfef295f1 +size 176 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..1e65aad --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DNN Medium model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 | +| Size (Bytes) | 203832 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 83.93% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. 
+* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 250) | int8 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..c519ab1 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,62 @@ +benchmark: + benchmark_metrics: + Accuracy: 83.93% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 203832 + filename: dnn_m_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 250) + example_input: + path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 250 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 250 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/dnn_m_quantized.tflite b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/dnn_m_quantized.tflite similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/dnn_m_quantized.tflite rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/dnn_m_quantized.tflite diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb new file mode 100644 index 0000000..08ef7e5 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dfba08e6695d3429dc605cf00dd1e6950f646faf61fc9876de9471f66ee419 +size 10087 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb new file mode 100644 index 0000000..770dcc1 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef3a9281ac9bc4de4ce805938bfdb673c0c06627ce977e11521c0782c1999256 +size 85126 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..afb21fe --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69f2943b2684b7c153e67808422daa3f61b229dd3a6092b5ae5af95d1eaf3ff6 +size 798335 diff --git 
a/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index new file mode 100644 index 0000000..7a51ce6 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/saved_model/dnn_medium/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d85b2373052882c55abdeb07a4c061ad4aa23c0c36a72db08dc17a515d30363 +size 641 diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.data-00000-of-00001 rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.index b/models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.index similarity index 100% rename from models/keyword_spotting/dnn_medium/tflite_int8/ckpt/dnn_0.86_ckpt.index rename to models/keyword_spotting/dnn_medium/model_package_tf/model_archive/model_source/weights/dnn_0.86_ckpt.index diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. 
+ window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. 
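+    # Illustration: with this script's defaults of --how_many_training_steps '3750,750' and
+    # --learning_rate '0.001,0.0001', fine-tuning runs 3750 steps at 1e-3 followed by 750 steps at 1e-4.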
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..2a465cf --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
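+
+# Usage (see README.md in this folder for details):
+#   bash ./recreate_model.sh                       convert the supplied pre-trained checkpoint to TFLite (fp32 and int8)
+#   bash ./recreate_model.sh --train               train from scratch first (conversion still uses the checkpoint set below)
+#   bash ./recreate_model.sh --ckpt <checkpoint>   convert from a specific checkpoint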
+ +ckpt_path=model_archive/model_source/weights/dnn_0.86_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DNN Medium training +if [ "$train" = true ] +then +python train.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_M/retrain_logs --train_dir work/DNN/DNN_M/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture dnn --model_size_info 256 256 256 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/train.py b/models/keyword_spotting/dnn_medium/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
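+    # Illustration: with the defaults of --how_many_training_steps '15000,3000' and
+    # --learning_rate '0.001,0.0001', training runs 15000 steps at 1e-3 and then 3000 steps at 1e-4.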
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/dnn_medium/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/README.md b/models/keyword_spotting/dnn_medium/tflite_int8/README.md deleted file mode 100644 index cfc52ce..0000000 --- a/models/keyword_spotting/dnn_medium/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DNN Medium INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DNN Medium model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. 
- -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 | -| Size (Bytes) | 203832 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.844 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml deleted file mode 100644 index abcfbd8..0000000 --- a/models/keyword_spotting/dnn_medium/tflite_int8/definition.yaml +++ /dev/null @@ -1,41 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 84.44% -description: 'This is a fully quantized version (asymmetrical int8) of the DNN Medium - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 203832 - filename: dnn_m_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 7e138f99cfc6a603a1fc735a2d9c3e28a41a6a43 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 250) - example_input: - path: models/keyword_spotting/dnn_medium/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 250 - output_nodes: - - description: The probability on 12 keywords. - name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/dnn_medium/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/dnn_medium/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/model_package_tf/README.md b/models/keyword_spotting/dnn_small/model_package_tf/README.md new file mode 100644 index 0000000..7d73dab --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DNN Small model package + +This folder contains code that will allow you to recreate the DNN Small keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DNN_Small | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Optimised | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. +Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). 
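+
+As a rough illustration of how *model_size_info* drives the DNN topology (the authoritative construction is in [models.py](models.py), so treat the sketch below as an approximation): each entry becomes one fully-connected layer with a ReLU activation, stacked on the flattened MFCC fingerprint and followed by a softmax layer over the 12 output classes. The fingerprint size of 250 used below is an assumed example value.
+
+```python
+import tensorflow as tf
+
+def build_dnn_sketch(fingerprint_size, label_count, model_size_info=(128, 128, 128)):
+    """Illustrative only: one Dense+ReLU layer per entry of model_size_info."""
+    layers = [tf.keras.layers.InputLayer(input_shape=(fingerprint_size,))]
+    for units in model_size_info:
+        layers.append(tf.keras.layers.Dense(units, activation='relu'))
+    layers.append(tf.keras.layers.Dense(label_count, activation='softmax'))
+    return tf.keras.Sequential(layers)
+
+# A 3x128 DNN over an assumed 250-element MFCC fingerprint and 12 labels
+# (10 keywords plus the silence and unknown classes):
+model = build_dnn_sketch(fingerprint_size=250, label_count=12)
+model.summary()
+```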
+
+## Testing
+To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:
+```
+python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint 
+```
+The parameters used here should match those used in the Training step.
+
+## Optimization
+
+We introduce a new *optional* step to optimize the trained keyword spotting model for deployment.
+
+Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.
+
+To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.
+You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.
+
+To apply the optimization and fine-tuning, run the following command:
+```
+python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint 
+```
+The parameters used here should match those used in the Training step, except for the number of training steps.
+The number of training steps is reduced since the optimization step only requires fine-tuning.
+
+This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model.
+
+## Quantization and TFLite Conversion
+
+As part of the update we now use TensorFlow's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to
+make quantization of the trained models straightforward.
+
+To quantize your trained model (e.g. a DNN) run:
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]
+```
+The parameters used here should match those used in the Training step.
+
+The inference_type parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.
+
+This step will produce a quantized TFLite file *dnn_quantized.tflite*.
+You can test the accuracy of this quantized model on the test set by running:
+```
+python evaluation.py --tflite_path dnn_quantized.tflite
+```
+The parameters used here should match those used in the Training step.
+
+`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:
+
+```
+python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize
+```
+
+This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above.
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py
new file mode 100644
index 0000000..64ab8df
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/convert_to_tflite.py
@@ -0,0 +1,234 @@
+# Copyright © 2023 Arm Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. 
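+    # Calibration note: during conversion the representative dataset is run through the
+    # float model so the converter can pick scale and zero-point values for activations,
+    # inputs and outputs; weights are quantized directly from their trained values.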
+ converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
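+        # The chosen noise clip is scaled by background_volume, added to the (possibly
+        # time-shifted) foreground audio, and the sum is clipped back to [-1, 1] before
+        # the MFCC features are computed.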
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
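+            # After shuffling, a subset sized at unknown_percentage% of the partition's
+            # original size is taken from the non-target words; these clips are mapped to
+            # the '_unknown_' class index when labels are converted below.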
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/dnn_s_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + 
'--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py new file mode 100644 index 0000000..9cf3d0c --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from dnn_s_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
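+    # Same procedure as the validation pass above: run each MFCC through the TFLite
+    # interpreter, take the argmax of the output as the predicted class, then report
+    # accuracy and a confusion matrix against the ground-truth label indices.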
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..1332d4e --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DNN_Small - Optimised\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DNN_Small \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Optimised |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:25:23.242199: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:26:16.311986: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:26:16.348776: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:26:16.348818: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:26:16.369436: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:26:16.369509: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:26:16.372294: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:26:16.372684: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:26:16.373267: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:26:16.374012: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:26:16.374168: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:26:16.374680: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:26:16.374967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:26:16.375884: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 13:26:16.376614: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:26:16.376682: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:26:16.822126: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:26:16.822161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:26:16.822173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:26:16.822780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:26:17.956358: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:26:18.216079: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:26:18.216285: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:26:18.216661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:26:18.216906: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:26:18.216936: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:26:18.216946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:26:18.216953: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:26:18.217236: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:26:18.235442: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:26:18.236450: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.011ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 13:26:18.268723: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:26:18.268758: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:26:18.271003: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:26:18.272912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:26:18.273329: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:26:18.273362: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:26:18.273373: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:26:18.273385: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:26:18.273700: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to dnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 13:26:18.314546: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 238 16 9 5 30 11 10 15 7 9 21]\n", + " [ 0 7 341 8 0 6 26 5 1 0 0 3]\n", + " [ 0 8 7 316 5 17 0 5 2 1 4 41]\n", + " [ 0 8 1 2 287 3 5 4 6 19 7 8]\n", + " [ 0 10 1 22 2 317 2 0 5 2 1 15]\n", + " [ 0 5 27 2 1 2 299 9 0 3 0 4]\n", + " [ 1 13 2 2 0 2 3 334 2 2 0 2]\n", + " [ 2 9 1 1 6 6 2 0 318 13 1 4]\n", + " [ 1 4 1 0 29 0 1 1 17 311 4 4]\n", + " [ 2 2 0 1 15 5 0 1 4 5 310 5]\n", + " [ 0 10 1 38 8 26 2 1 3 1 1 281]]\n", + "Validation accuracy = 83.76%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 274 12 20 8 11 16 13 20 7 8 19]\n", + " [ 1 9 375 3 0 6 20 0 0 0 1 4]\n", + " [ 0 14 5 312 1 33 6 0 0 0 3 31]\n", + " [ 0 12 0 3 362 5 3 5 8 11 13 3]\n", + " [ 0 10 2 34 2 332 5 0 5 0 3 13]\n", + " [ 0 12 27 5 4 1 339 17 1 2 2 2]\n", + " [ 0 12 0 2 4 1 9 362 1 3 0 2]\n", + " [ 1 12 0 3 3 14 1 1 336 20 1 4]\n", + " [ 1 6 3 2 16 0 3 1 19 338 2 11]\n", + " [ 0 5 1 2 22 4 3 0 0 2 367 5]\n", + " [ 0 17 0 65 6 17 3 2 2 5 2 283]]\n", + "Test accuracy = 83.60%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 13:26:30.279559: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 13:27:20.964068: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 13:27:21.007726: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 
deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:27:21.007765: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:27:21.028042: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 13:27:21.028131: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 13:27:21.030956: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 13:27:21.031218: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 13:27:21.031788: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 13:27:21.032512: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 13:27:21.032668: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 13:27:21.033033: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:27:21.033325: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 13:27:21.034039: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:27:21.034415: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:27:21.034486: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 13:27:21.478837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:27:21.478873: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:27:21.478882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:27:21.479411: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 13:27:22.568489: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 13:27:22.830822: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 13:27:22.831041: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 13:27:22.831444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:27:22.831775: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:27:22.831807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:27:22.831816: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:27:22.831823: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:27:22.832109: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:27:22.851539: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 13:27:22.852738: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.013ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 13:27:22.888443: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 13:27:22.888491: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 13:27:22.891172: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 13:27:22.893139: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 13:27:22.893390: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 13:27:22.893420: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 13:27:22.893430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 13:27:22.893437: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 13:27:22.893709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10939 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 13:27:22.923079: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to dnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 241 17 14 5 27 12 9 17 6 7 16]\n", + " [ 0 11 340 11 4 5 21 4 1 0 0 0]\n", + " [ 0 15 7 315 10 14 1 2 2 1 6 33]\n", + " [ 0 10 1 6 282 4 6 3 5 19 10 4]\n", + " [ 0 17 2 26 8 300 1 0 6 0 4 13]\n", + " [ 0 8 30 3 6 1 293 7 0 1 2 1]\n", + " [ 0 17 2 4 6 1 9 316 1 2 4 1]\n", + " [ 2 9 1 1 10 4 2 2 317 11 0 4]\n", + " [ 1 8 1 2 33 0 0 2 15 303 6 2]\n", + " [ 2 6 0 2 25 5 0 0 2 1 304 3]\n", + " [ 0 16 1 47 15 27 2 1 3 1 4 255]]\n", + "Validation accuracy = 81.82%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 281 13 23 11 10 14 14 21 6 6 9]\n", + " [ 0 12 372 6 6 6 13 0 0 0 1 3]\n", + " [ 0 19 8 311 6 28 4 0 0 0 4 25]\n", + " [ 0 20 2 6 359 6 1 2 6 7 14 2]\n", + " [ 0 15 5 36 10 318 3 0 4 2 2 11]\n", + " [ 0 12 33 6 13 3 320 19 0 2 2 2]\n", + " [ 1 17 0 4 5 1 11 347 1 1 7 1]\n", + " [ 0 16 0 6 8 16 1 1 326 18 3 1]\n", + " [ 1 6 3 4 37 1 3 2 19 314 3 9]\n", + " [ 0 10 0 6 28 3 4 0 0 1 354 5]\n", + " [ 0 19 0 73 18 19 3 2 3 4 2 259]]\n", + "Test accuracy = 81.17%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. 
The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python dnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..78f4f45 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DNN Small model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 7491539a547ee30b87c266e6bbb4455e0c8f556d | +| Size (Bytes) | 320648 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 83.60% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
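+
+## Example Usage
+The snippet below is an illustrative sketch (it is not a script shipped in this package) showing how the model file and the bundled example input listed later in this README can be exercised with the TensorFlow Lite interpreter. It assumes TensorFlow and NumPy are installed and that paths are relative to the repository root.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+base = "models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32"
+
+# Load the fp32 TFLite file shipped in this folder.
+interpreter = tf.lite.Interpreter(model_path=f"{base}/dnn_s.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Bundled example MFCC features with shape (1, 250).
+features = np.load(f"{base}/testing_input/input/0.npy").astype(np.float32)
+
+interpreter.set_tensor(input_details["index"], features)
+interpreter.invoke()
+probs = interpreter.get_tensor(output_details["index"])[0]  # shape (12,)
+
+# Labels as listed in validation_utils/labels.txt.
+labels = ["_silence_", "_unknown_", "yes", "no", "up", "down",
+          "left", "right", "on", "off", "stop", "go"]
+print(labels[int(np.argmax(probs))], float(np.max(probs)))
+```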
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Deployable |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 250) | fp32 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 250] | The input is a processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..0458507
--- /dev/null
+++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,62 @@
+benchmark:
+  benchmark_metrics:
+    accuracy: 83.60%
+  benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DNN Small model developed
+  by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+  datatype: fp32
+  file_size_bytes: 320648
+  filename: dnn_s.tflite
+  framework: TensorFlow Lite
+  hash:
+    algorithm: sha1
+    value: 7491539a547ee30b87c266e6bbb4455e0c8f556d
+  provenance: https://arxiv.org/abs/1711.07128
+  training: Trained by Arm
+network_parameters:
+  input_nodes:
+  - description: The input is a processed MFCCs of shape (1, 250)
+    example_input:
+      path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+      shape:
+      - 1
+      - 250
+      type: fp32
+      use_case: Random input for model regression.
+    input_datatype: fp32
+    name: input
+    shape:
+    - 1
+    - 250
+  output_nodes:
+  - description: The probability on 12 keywords.
+    example_output:
+      path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity
+      shape:
+      - 1
+      - 12
+      type: fp32
+      use_case: output for model regression.
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite new file mode 100644 index 0000000..84cf83d --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/dnn_s.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7beaf5a4b740228324fc48db72eb2dab16854278676cb3f67268fee5910ab5f8 +size 320648 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..fd525dc --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f9883bea3889da8d87477965f034c7f8a453636a4ed5897c34c0798a41924f8 +size 1128 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..3d71018 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74580f29a9cea2e7f1f179e930c05d4d2ac884c70b535d7c5f988bc38c47258 +size 176 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..91932d2 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DNN Small model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b | +| Size (Bytes) | 83544 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 82.11% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. 
+* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 250) | int8 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 250] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..d653ebc --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,62 @@ +benchmark: + benchmark_metrics: + Accuracy: 82.11% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 83544 + filename: dnn_s_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 250) + example_input: + path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 250 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 250 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. 
+ name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - FULLY_CONNECTED + - RELU + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/tflite_int8/dnn_s_quantized.tflite b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/dnn_s_quantized.tflite similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/dnn_s_quantized.tflite rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/dnn_s_quantized.tflite diff --git a/models/keyword_spotting/dnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb new file mode 100644 index 0000000..4f01a9c --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7035d087e4fea7940fc83080a1b64f4d8cdec6d8344aadb5876ff41994807bbf +size 10087 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb new file mode 100644 index 0000000..152a69e --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c53338f2dc3fb47b591a96d93710047cc31fe9aa697bbf51283ce3b7d3557fe +size 84664 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..d945297 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd85a15e363ad2aeb3bf02308e5f89137221c1c6c658e71ccba21aefbba99d63 +size 321215 diff --git 
a/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index new file mode 100644 index 0000000..35dd996 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/saved_model/dnn_small/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc2c60477817e8647d6ebbe1409b40435de6bcaef280b0a41cf5713d3ec95393 +size 641 diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.data-00000-of-00001 b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.data-00000-of-00001 rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.index b/models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.index similarity index 100% rename from models/keyword_spotting/dnn_small/tflite_int8/ckpt/dnn_0.84_ckpt.index rename to models/keyword_spotting/dnn_small/model_package_tf/model_archive/model_source/weights/dnn_0.84_ckpt.index diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. 
+ window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. 
+ """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. + + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. 
+ x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. + x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. 
+ x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. 
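+    # Note: PiecewiseConstantDecay (used below) expects len(values) == len(boundaries) + 1,
+    # which is why the final entry of --how_many_training_steps is not used as a boundary.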
+ lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..d00f43f --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
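+
+# Usage examples (options are parsed below):
+#   bash ./recreate_model.sh                 # convert the bundled pre-trained checkpoint to TFLite and evaluate
+#   bash ./recreate_model.sh --train         # run training from scratch before the conversion steps
+#   bash ./recreate_model.sh --ckpt <path>   # convert a specific checkpoint instead of the default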
+ +ckpt_path=model_archive/model_source/weights/dnn_0.84_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DNN Small training +if [ "$train" = true ] +then +python train.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DNN/DNN_S/retrain_logs --train_dir work/DNN/DNN_S/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture dnn --model_size_info 144 144 144 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 40 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/model_package_tf/train.py b/models/keyword_spotting/dnn_small/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
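+    # For example, the defaults --how_many_training_steps '15000,3000' and
+    # --learning_rate '0.001,0.0001' give 15000 steps at 1e-3 followed by 3000 steps at 1e-4.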
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/dnn_small/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/dnn_small/tflite_int8/README.md b/models/keyword_spotting/dnn_small/tflite_int8/README.md deleted file mode 100644 index 1f5d3f8..0000000 --- a/models/keyword_spotting/dnn_small/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DNN Small INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. 
- -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b | -| Size (Bytes) | 83544 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.825 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 250) | The input is a processed MFCCs of shape (1, 250) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml deleted file mode 100644 index 7f66d4d..0000000 --- a/models/keyword_spotting/dnn_small/tflite_int8/definition.yaml +++ /dev/null @@ -1,41 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 82.45% -description: 'This is a fully quantized version (asymmetrical int8) of the DNN Small - model developed by Arm, with training checkpoints, from the Hello Edge paper. Code - to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 83544 - filename: dnn_s_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 4b92e09fb43b2f042ce2811b91c7c67bf7186b6b - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: null -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 250) - example_input: - path: models/keyword_spotting/dnn_small/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 250 - output_nodes: - - description: The probability on 12 keywords. - name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/dnn_small/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/dnn_small/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md new file mode 100644 index 0000000..c4e4d69 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DS-CNN Large model package + +This folder contains code that will allow you to recreate the DS-CNN Large keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DS_CNN_Large | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Hero | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. +Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). 
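+
+For illustration, a DS-CNN run follows the same pattern. The sketch below assumes the architecture is
+registered as `ds_cnn` in [models.py](models.py) and uses placeholder values; the authoritative DS-CNN Large
+flags, including the full *--model_size_info* list, are those recorded in [recreate_model.sh](recreate_model.sh):
+
+```
+python train.py --model_architecture ds_cnn --model_size_info <layer count and per-layer dimensions> --train_dir work/DS_CNN/DS_CNN_L/training --summaries_dir work/DS_CNN/DS_CNN_L/retrain_logs
+```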
+ +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and should be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. 
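+    # The converter feeds these samples through the float model to calibrate the
+    # quantization parameters (scale and zero point) of each activation tensor.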
+ converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
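+        # Scale the background sample by the chosen volume, add it to the (possibly shifted)
+        # foreground and clip the sum back into the [-1.0, 1.0] range.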
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
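+            # Shuffle first so the slice below picks a random subset of the unknown-word clips.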
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/ds_cnn_l_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + 
parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py new file mode 100644 index 0000000..da2c57c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from ds_cnn_l_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..73d594b --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DS_CNN_Large - Hero\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DS_CNN_Large \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Hero |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 11:38:02.599656: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 11:38:53.030038: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 11:38:53.069964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:38:53.070029: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:38:53.094139: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 11:38:53.094219: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 11:38:53.096985: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 11:38:53.097285: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 11:38:53.097852: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 11:38:53.098590: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 11:38:53.098752: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 11:38:53.099168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:38:53.099481: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 11:38:53.100222: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 11:38:53.100624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:38:53.100693: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:38:53.524442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:38:53.524481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:38:53.524492: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:38:53.524999: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 11:38:56.213089: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 11:38:58.326629: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 11:38:58.326721: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 11:38:58.327408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:38:58.327678: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:38:58.327711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:38:58.327721: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:38:58.327731: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:38:58.328025: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:38:58.347388: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 11:38:58.352977: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 11:38:58.537693: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 11:38:58.537738: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 11:38:58.545075: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 11:38:58.548334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:38:58.548626: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:38:58.548661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:38:58.548672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:38:58.548679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:38:58.548981: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10974 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to ds_cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 11:38:58.616947: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 339 1 5 2 3 6 5 3 2 1 4]\n", + " [ 0 2 382 0 0 2 8 0 0 0 0 3]\n", + " [ 0 5 2 382 1 7 1 0 0 0 0 8]\n", + " [ 0 2 0 0 323 1 3 0 1 14 5 1]\n", + " [ 0 2 0 6 1 361 1 1 0 0 3 2]\n", + " [ 0 0 5 1 0 0 344 2 0 0 0 0]\n", + " [ 0 3 0 1 0 0 0 358 0 0 0 1]\n", + " [ 1 3 0 2 4 1 0 0 344 7 0 1]\n", + " [ 0 2 1 0 18 0 1 0 4 342 3 2]\n", + " [ 0 1 0 0 8 0 0 1 0 4 335 1]\n", + " [ 0 4 0 9 1 5 0 0 1 2 2 348]]\n", + "Validation accuracy = 95.14%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 368 2 3 1 4 6 4 6 3 1 10]\n", + " [ 0 1 405 2 0 0 10 0 0 1 0 0]\n", + " [ 0 0 1 389 0 7 2 0 0 1 0 5]\n", + " [ 0 6 0 0 399 2 0 0 2 10 5 1]\n", + " [ 0 5 2 8 0 380 3 0 0 0 1 7]\n", + " [ 0 3 5 2 0 1 400 1 0 0 0 0]\n", + " [ 0 6 1 1 0 0 4 383 0 0 1 0]\n", + " [ 0 7 0 0 3 8 0 0 369 8 0 1]\n", + " [ 0 2 0 2 13 0 0 0 5 374 0 6]\n", + " [ 0 0 0 1 7 3 0 0 1 0 398 1]\n", + " [ 0 3 1 18 3 2 0 0 0 1 0 374]]\n", + "Test accuracy = 95.03%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 11:39:46.821173: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 11:40:36.690810: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 11:40:36.728954: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 
510.07GiB/s\n", + "2023-01-31 11:40:36.728995: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:40:36.749408: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 11:40:36.749475: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 11:40:36.752323: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 11:40:36.752624: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 11:40:36.753198: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 11:40:36.753937: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 11:40:36.754090: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 11:40:36.754586: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:40:36.754864: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 11:40:36.755740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:40:36.756134: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:40:36.756197: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:40:37.210806: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:40:37.210845: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:40:37.210854: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:40:37.211393: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 11:40:39.812506: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 11:40:42.235293: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 11:40:42.235385: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 11:40:42.236028: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:40:42.236295: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:40:42.236328: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:40:42.236339: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:40:42.236348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:40:42.236662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:40:42.255416: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 11:40:42.259691: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.012ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 11:40:42.434390: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 11:40:42.434429: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 11:40:42.441258: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 11:40:42.444349: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:40:42.444613: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:40:42.444644: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:40:42.444655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:40:42.444662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:40:42.444950: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10994 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:40:42.484939: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to ds_cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 333 2 4 2 3 6 5 3 1 3 9]\n", + " [ 0 2 378 1 0 2 8 1 0 0 2 3]\n", + " [ 0 5 1 375 2 5 2 0 2 0 1 13]\n", + " [ 0 5 0 0 321 2 3 0 1 11 6 1]\n", + " [ 0 2 0 7 1 354 1 1 1 2 6 2]\n", + " [ 0 1 8 1 2 0 338 2 0 0 0 0]\n", + " [ 0 2 0 1 1 0 0 355 0 1 1 2]\n", + " [ 1 4 0 1 3 1 1 0 345 6 1 0]\n", + " [ 0 1 0 1 27 0 2 1 5 330 4 2]\n", + " [ 0 2 1 0 9 0 0 1 0 3 333 1]\n", + " [ 0 4 0 12 3 5 1 0 1 0 6 340]]\n", + "Validation accuracy = 93.88%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 363 2 5 3 5 7 3 8 0 3 9]\n", + " [ 0 2 399 3 0 0 13 0 0 1 1 0]\n", + " [ 0 1 1 384 0 7 4 0 0 0 2 6]\n", + " [ 0 9 0 0 398 2 1 0 1 7 6 1]\n", + " [ 0 5 3 12 1 372 5 0 1 0 1 6]\n", + " [ 0 4 5 2 1 0 395 1 0 0 4 0]\n", + " [ 0 8 0 4 3 2 7 370 0 0 2 0]\n", + " [ 0 9 0 1 6 8 0 2 361 7 2 0]\n", + " [ 0 2 0 2 16 0 1 0 5 367 2 7]\n", + " [ 0 0 0 0 11 3 0 3 1 2 389 2]\n", + " [ 0 6 1 19 4 5 3 0 0 1 2 361]]\n", + "Test accuracy = 93.39%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. 
For example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --train\n", + "```\n", + "\n", + "Training is then performed and should produce a model to the stated accuracy in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n", + "\n", + "```bash\n", + "bash ./recreate_model.sh --ckpt \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Training\n", + "\n", + "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper, other varieties are covered in other parts of the repository.\n", + "\n", + "\n", + "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", + "```\n", + "python train.py --model_architecture dnn --model_size_info 128 128 128\n", + "```\n", + "\n", + "The command line argument *--model_size_info* is used to pass the neural network layer\n", + "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n", + "which builds the TensorFlow graph based on the provided model architecture\n", + "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", + "[models.py](model_core_utils/models.py).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Testing\n", + "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n", + "```\n", + "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.0 Optimization\n", + "\n", + "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", + "\n", + "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters.\n", + "\n", + "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python ds_cnn_l_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md new file mode 100644 index 0000000..be17ae3 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32 + +## Description +This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained (fine-tuned) floating point fp32 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | d9af9829a2363c21fd6158c7bc425d0b635eb55c | +| Size (Bytes) | 1652648 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 94.76% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
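+
+## Quick Check With the Bundled Example Data
+The `testing_input` and `testing_output` folders listed in the tables below hold one example MFCC feature array and the matching reference output, which can double as a quick regression check. The snippet below is a minimal sketch, not one of the package scripts; it assumes it is run from the repository root and uses only the standard TensorFlow Lite interpreter API:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+base = ("models/keyword_spotting/ds_cnn_large/model_package_tf/"
+        "model_archive/TFLite/tflite_clustered_fp32")
+
+# Load the clustered fp32 model and the bundled example input (shape (1, 490)).
+interpreter = tf.lite.Interpreter(model_path=f"{base}/ds_cnn_l_clustered_fp32.tflite")
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+mfcc = np.load(f"{base}/testing_input/input/0.npy").astype(np.float32)
+interpreter.set_tensor(inp["index"], mfcc)
+interpreter.invoke()
+probs = interpreter.get_tensor(out["index"])  # shape (1, 12)
+
+# Compare against the bundled reference output.
+reference = np.load(f"{base}/testing_output/Identity/0.npy")
+print("max abs diff vs. reference:", np.abs(probs - reference).max())
+```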
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_check_mark: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml new file mode 100644 index 0000000..77d4f8c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/definition.yaml @@ -0,0 +1,67 @@ +benchmark: + benchmark_metrics: + accuracy: 94.76% + benchmark_name: Google Speech Commands test set +description: This is a clustered (32 clusters, kmeans++ centroid initialization) + and retrained (fine-tuned) fp32 version of the DS-CNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 1652648 + filename: ds_cnn_l_clustered_fp32.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: d9af9829a2363c21fd6158c7bc425d0b635eb55c + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: true + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ds_cnn_clustered_fp32.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/ds_cnn_l_clustered_fp32.tflite similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ds_cnn_clustered_fp32.tflite rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/ds_cnn_l_clustered_fp32.tflite diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_input/input/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_fp32/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md new file mode 100644 index 0000000..976c8c6 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8 + +## Description +This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained (fine-tuned) fully quantized int8 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper. 
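+
+The clustering itself is applied by the package's `optimisations.py` script (32 clusters, kmeans++ centroid initialization, followed by fine-tuning), as described in the how-to guidance. For orientation only, a weight-clustering pass with the TensorFlow Model Optimization Toolkit looks roughly like the sketch below; the two-layer stand-in model is a placeholder, not the DS-CNN Large definition used in this package:
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+# Placeholder for the trained Keras model; in this package it would come from
+# model_core_utils/models.py plus the pre-trained checkpoint.
+base_model = tf.keras.Sequential([
+    tf.keras.layers.Dense(64, activation="relu", input_shape=(490,)),
+    tf.keras.layers.Dense(12, activation="softmax"),
+])
+
+# Constrain the weights to 32 clusters with kmeans++ centroid initialization,
+# matching the description of this model.
+clustered_model = tfmot.clustering.keras.cluster_weights(
+    base_model,
+    number_of_clusters=32,
+    cluster_centroids_init=tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS)
+
+clustered_model.compile(optimizer="adam",
+                        loss="sparse_categorical_crossentropy",
+                        metrics=["accuracy"])
+# ... fine-tune on the Speech Commands training set here ...
+
+# Strip the clustering wrappers before quantization and TFLite conversion.
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```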
+ +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|------------------------------------------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 2ee38794ed171c75d3313460a1633c5d6a79f530 | +| Size (Bytes) | 503816 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 93.87% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Deployable | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_check_mark: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input | int8 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml new file mode 100644 index 0000000..a3adef5 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/definition.yaml @@ -0,0 +1,67 @@ +benchmark: + benchmark_metrics: + accuracy: 93.87% + benchmark_name: Google Speech Commands test set +description: This is a clustered (32 clusters, kmeans++ centroid initialization) + and retrained (fine-tuned) fully quantized int8 version of the DS-CNN Large model developed + by Arm, from the Hello Edge paper. 
+license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 503816 + filename: ds_cnn_l_clustered_int8.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 2ee38794ed171c75d3313460a1633c5d6a79f530 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. + name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: true + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ds_cnn_clustered_int8.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/ds_cnn_l_clustered_int8.tflite similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ds_cnn_clustered_int8.tflite rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/ds_cnn_l_clustered_int8.tflite diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_clustered_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..7647971 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper. 
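+
+The model consumes a flattened MFCC fingerprint of shape (1, 490), as listed under Network Inputs below. As a rough sketch of where that number comes from, `prepare_model_settings` in `model_core_utils/models.py` can be called with the DS-CNN front-end settings (assumed here to be a 40 ms window, 20 ms stride and 10 MFCC coefficients; the exact flags live in `recreate_model.sh`), giving 49 frames of 10 coefficients each:
+
+```python
+# Run from the model_package_tf folder so the package module resolves.
+from model_core_utils import models
+
+settings = models.prepare_model_settings(
+    label_count=12,             # 10 keywords plus silence and unknown
+    sample_rate=16000,
+    clip_duration_ms=1000,
+    window_size_ms=40.0,        # assumed front-end values; see recreate_model.sh
+    window_stride_ms=20.0,
+    dct_coefficient_count=10)
+
+print(settings["spectrogram_length"])  # 49 frames
+print(settings["fingerprint_size"])    # 49 * 10 = 490 features
+```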
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information | Value |
+|---------------------|-------|
+| Framework | TensorFlow Lite |
+| Datatype | fp32 |
+| SHA-1 Hash | fea0e0dc13fc4207dd44904fe701f34254dd4767 |
+| Size (Bytes) | 1652648 |
+| Provenance | https://arxiv.org/abs/1711.07128 |
+| Training | Trained by Arm |
+| Paper | https://arxiv.org/abs/1711.07128 |
+
+## DataSet
+| Dataset Information | Value |
+|--------|-------|
+| Name | Google Speech Commands test set |
+
+## Accuracy
+
+| Metric | Value |
+|--------|-------|
+| accuracy | 95.03% |
+
+## HW Support
+| HW Support | Value |
+|--------------|-------|
+| Cortex-A |:heavy_check_mark: |
+| Cortex-M |:heavy_check_mark: HERO |
+| Mali GPU |:heavy_check_mark: |
+| Ethos U |:heavy_multiplication_x: |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Network Quality
+| Network Quality | Value |
+|-------------------------|-------|
+| Recreate | :heavy_check_mark: |
+| Quality level | Hero |
+| Vanilla | :heavy_check_mark: |
+| Clustered | :heavy_multiplication_x: |
+| Pruned | :heavy_multiplication_x: |
+| Quantization - default | :heavy_multiplication_x: |
+| Quantization - full | :heavy_multiplication_x: |
+
+## Network Inputs
+| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs |
+
+## Network Outputs
+| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case |
+|-----------------|-------|-------|--------------|-------|-------|-----------------|
+| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords |
\ No newline at end of file
diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
new file mode 100644
index 0000000..288d185
--- /dev/null
+++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml
@@ -0,0 +1,66 @@
+benchmark:
+  benchmark_metrics:
+    accuracy: 95.03%
+  benchmark_name: Google Speech Commands test set
+description: This is a floating point fp32 version of the DS-CNN Large model developed
+  by Arm, from the Hello Edge paper.
+license:
+- Apache-2.0
+network:
+  datatype: fp32
+  file_size_bytes: 1652648
+  filename: ds_cnn_l.tflite
+  framework: TensorFlow Lite
+  hash:
+    algorithm: sha1
+    value: fea0e0dc13fc4207dd44904fe701f34254dd4767
+  provenance: https://arxiv.org/abs/1711.07128
+  training: Trained by Arm
+network_parameters:
+  input_nodes:
+  - description: The input is a processed MFCCs of shape (1, 490)
+    example_input:
+      path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input
+      shape:
+      - 1
+      - 490
+      type: fp32
+      use_case: Random input for model regression.
+    input_datatype: fp32
+    name: input
+    shape:
+    - 1
+    - 490
+  output_nodes:
+  - description: The probability on 12 keywords.
+ example_output: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. + name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite new file mode 100644 index 0000000..6619422 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_l.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703bedd0f14360a47ac870a51b13dfde965e4be4d901ee8c6b87bd2f3360671b +size 1652648 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..8886270 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add2d479791b5e4aa5e4bfd8f16cf47f965783aff20845a8283fa7e571cabd50 +size 2088 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..5b8a6d6 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffd3d5e8b2601d820fd4b4c786d5f475075848f6f9636a5d62a7c38f30d2cc0 +size 176 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..7f813ed --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DS-CNN Large model developed by Arm, from the Hello Edge paper. 
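+
+Because the inputs and outputs are int8, callers quantize the float MFCC features with the input tensor's scale and zero point before invoking the model, and dequantize the output afterwards. The snippet below is a minimal sketch of that standard TensorFlow Lite pattern, not one of the package scripts; the zero array is a placeholder for real MFCC features:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+model_path = ("models/keyword_spotting/ds_cnn_large/model_package_tf/"
+              "model_archive/TFLite/tflite_int8/ds_cnn_l_quantized.tflite")
+interpreter = tf.lite.Interpreter(model_path=model_path)
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Placeholder for a real (1, 490) MFCC fingerprint from the training front end.
+mfcc = np.zeros((1, 490), dtype=np.float32)
+
+# Quantize the float features with the input tensor's scale and zero point.
+scale, zero_point = inp["quantization"]
+quantized = np.clip(np.round(mfcc / scale + zero_point), -128, 127).astype(np.int8)
+
+interpreter.set_tensor(inp["index"], quantized)
+interpreter.invoke()
+
+# Dequantize the int8 output back to per-keyword probabilities.
+raw = interpreter.get_tensor(out["index"])
+out_scale, out_zero = out["quantization"]
+probs = (raw.astype(np.float32) - out_zero) * out_scale
+print("predicted class index:", int(np.argmax(probs)))
+```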
+ +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 | +| Size (Bytes) | 503816 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 94.52% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..6a2b864 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + Accuracy: 94.52% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DS-CNN Large model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 503816 + filename: ds_cnn_l_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. 
+ example_output: + path: models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. + name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ds_cnn_l_quantized.tflite b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_l_quantized.tflite similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ds_cnn_l_quantized.tflite rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_l_quantized.tflite diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb new file mode 100644 index 0000000..454265f --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5e42915f74efe437002d09ef323928da8efdc68b403118711d05871534690e +size 78436 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb new file mode 100644 index 0000000..95b9f8f --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac426a86f4d862a0055c945b92ecb0e8f3de3ea90542b2731764b67c2e9ae3f3 +size 859950 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..77a395d 
--- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd31c705f2114c88f89660862742beb82a0bea80efd245969076e5339bccdf4 +size 1713786 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index new file mode 100644 index 0000000..7493cc8 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/saved_model/ds_cnn_large/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30be633de16e745ef0a11a3842ad8dbc70d8ead948acf049a613aff0c64cd3d +size 4397 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.index similarity index 100% rename from models/keyword_spotting/ds_cnn_large/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index rename to models/keyword_spotting/ds_cnn_large/model_package_tf/model_archive/model_source/weights/ds_cnn_0.95_ckpt.index diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. 
+ + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. 
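+    # In total model_size_info supplies 12 values for this architecture: five per
+    # convolution (feature count, kernel height/width, stride y/x) followed by the
+    # linear layer and fully-connected layer sizes read below.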
+ + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. + x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. 
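+                # Layers after the first form depthwise separable blocks: a depthwise
+                # convolution followed by a pointwise (1x1) convolution, each with
+                # batch normalization and ReLU.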
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
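+    # For example, with the defaults below (how_many_training_steps='3750,750' and
+    # learning_rate='0.001,0.0001') the boundary list is [3750], so the first 3750
+    # fine-tuning steps use a learning rate of 0.001 and the remaining 750 use 0.0001.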
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. 
+ """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..fabe86c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +ckpt_path=model_archive/model_source/weights/ds_cnn_0.95_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DS-CNN Large training +if [ "$train" = true ] +then +python train.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_L/retrain_logs --train_dir work/DS_CNN/DS_CNN_L/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
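+    # For DS-CNN Large, recreate_model.sh invokes this script with
+    # --model_architecture ds_cnn --model_size_info 6 276 10 4 2 1 276 3 3 2 2
+    # 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1 276 3 3 1 1, i.e. six depthwise
+    # separable blocks with 276 features each.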
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_large/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md deleted file mode 100644 index 0643dd8..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/README.md +++ /dev/null @@ -1,76 +0,0 @@ -# DS-CNN Clustered FP32 - -## Description -This is a clustered (32 clusters, kmeans++ centroid initialization) and retrained 
(fine-tuned) FP32 version of the DS-CNN Large model developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted to Keras and optimized using the Clustering API in TensorFlow Model Optimization Toolkit. - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|----------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | d9af9829a2363c21fd6158c7bc425d0b635eb55c | -| Size (Bytes) | 1652648 | -| Provenance | The original model (before clustering and quantization) is a pretrained checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_multiplication_x: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_multiplication_x: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Top 1 Accuracy | 0.950 | - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Number of Clusters | 32 | -| Cluster Initialization | K-Means | - -## Network Inputs - - - - - - - - - - - -
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1,490) |
-
-## Network Outputs
-
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint deleted file mode 100644 index be5b265..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/checkpoint +++ /dev/null @@ -1,2 +0,0 @@ -model_checkpoint_path: "ds_cnn_clustered_ckpt" -all_model_checkpoint_paths: "ds_cnn_clustered_ckpt" diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 deleted file mode 100644 index fbbad53..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77f79b7be1dec13fa39088ca249cc6ea1ab2a0e0bab595034a81a7915d0584f1 -size 1699733 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index deleted file mode 100644 index f1630cc..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/ckpt/ds_cnn_clustered_ckpt.index +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62b15a99efc82778286c3de5248bbf4d246a751a95007d27c5e778527929b015 -size 4396 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml deleted file mode 100644 index f9c2303..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/definition.yaml +++ /dev/null @@ -1,47 +0,0 @@ -benchmark: - SpeechCommands: - top_1_accuracy: 0.950 -description: 'This is a clustered (32 clusters, kmeans++ centroid initialization) - and retrained (fine-tuned) FP32 version of the DS-CNN Large model developed by Arm - from the Hello Edge paper. Code for the original DS-CNN implementation can be found - here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted - to Keras and optimized using the Clustering API in TensorFlow Model Optimization - Toolkit.' -license: -- Apache-2.0 -network: - file_size_bytes: 1652648 - filename: ds_cnn_clustered_fp32.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: d9af9829a2363c21fd6158c7bc425d0b635eb55c - provenance: The original model (before clustering and quantization) is a pretrained - checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1,490) - example_input: - path: models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_input/input - name: input - shape: - - 1 - - 490 - type: float32 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/testing_output/Identity -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - FULLY_CONNECTED - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_fp32/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md deleted file mode 100644 index 3e859ed..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# DS-CNN Clustered INT8 - -## Description -This is a clustered (32 clusters, kmeans++ centroid initialization), retrained (fine-tuned) and fully quantized version (INT8) of the DS-CNN Large model developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original model was converted to Keras, optimized using the Clustering API in TensorFlow Model Optimization Toolkit, and quantized using post-training quantization in the TF Lite Converter. - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|----------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 2ee38794ed171c75d3313460a1633c5d6a79f530 | -| Size (Bytes) | 503816 | -| Provenance | The original model (before clustering) is a pretrained checkpoint based on https://github.com/ARM-software/ML-KWS-for-MCU | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_multiplication_x: | -| Cortex-M |:heavy_check_mark: | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. 
- -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Top 1 Accuracy | 0.940 | - -## Optimizations -| Optimization | Value | -|--------------|---------| -| Quantization | INT8 | -| Number of Clusters | 32 | -| Cluster Initialization | K-Means | - -## Network Inputs - - - - - - - - - - - -
-| Input Node Name | Shape | Description |
-|-----------------|---------|-------------|
-| input | (1, 490) | The input is a processed MFCCs of shape (1,490) |
-
-## Network Outputs
-
-| Output Node Name | Shape | Description |
-|------------------|---------|-------------|
-| Identity | (1, 12) | The probability on 12 keywords. |
diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint deleted file mode 100644 index be5b265..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/checkpoint +++ /dev/null @@ -1,2 +0,0 @@ -model_checkpoint_path: "ds_cnn_clustered_ckpt" -all_model_checkpoint_paths: "ds_cnn_clustered_ckpt" diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 deleted file mode 100644 index fbbad53..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.data-00000-of-00001 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77f79b7be1dec13fa39088ca249cc6ea1ab2a0e0bab595034a81a7915d0584f1 -size 1699733 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index deleted file mode 100644 index f1630cc..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/ckpt/ds_cnn_clustered_ckpt.index +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62b15a99efc82778286c3de5248bbf4d246a751a95007d27c5e778527929b015 -size 4396 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml deleted file mode 100644 index 3d65144..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/definition.yaml +++ /dev/null @@ -1,48 +0,0 @@ -benchmark: - SpeechCommands: - top_1_accuracy: 0.940 -description: 'This is a clustered (32 clusters, kmeans++ centroid initialization), - retrained (fine-tuned) and fully quantized version (INT8) of the DS-CNN Large model - developed by Arm from the Hello Edge paper. Code for the original DS-CNN implementation - can be found here: https://github.com/ARM-software/ML-KWS-for-MCU. The original - model was converted to Keras, optimized using the Clustering API in TensorFlow Model - Optimization Toolkit, and quantized using post-training quantization in the TF Lite - Converter.' -license: -- Apache-2.0 -network: - file_size_bytes: 503816 - filename: ds_cnn_clustered_int8.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 2ee38794ed171c75d3313460a1633c5d6a79f530 - provenance: The original model (before clustering) is a pretrained checkpoint based - on https://github.com/ARM-software/ML-KWS-for-MCU -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1,490) - example_input: - path: models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_input/input - name: input - shape: - - 1 - - 490 - type: int8 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/testing_output/Identity -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - FULLY_CONNECTED - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_clustered_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md deleted file mode 100644 index e132990..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DS-CNN Large INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DS-CNN Large model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 | -| Size (Bytes) | 503816 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.946 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: HERO | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. 
- - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml deleted file mode 100644 index 54df622..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_int8/definition.yaml +++ /dev/null @@ -1,45 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 94.58% -description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN - Large model developed by Arm, with training checkpoints, from the Hello Edge paper. - Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 503816 - filename: ds_cnn_l_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 504f8e7bfa5c0f15c5475e5d08637b3b8aad0972 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: hero#CORTEX-M -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/ds_cnn_large/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. - name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/ds_cnn_large/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_large/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md new file mode 100644 index 0000000..47e2846 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DS-CNN Medium model package + +This folder contains code that will allow you to recreate the DS-CNN Medium keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DS_CNN_Medium | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Hero | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. +Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). + +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. 
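+In outline, this step wraps the trained Keras model with TensorFlow's clustering API and fine-tunes it. The snippet below is a minimal sketch only (`trained_model` is a placeholder for a model restored from a checkpoint); the complete flow, including fine-tuning and saving the clustered checkpoint, lives in `optimisations.py`:
+
+```python
+import tensorflow_model_optimization as tfmot
+
+clustered_model = tfmot.clustering.keras.cluster_weights(
+    trained_model,  # placeholder: a tf.keras Model with weights loaded from a checkpoint
+    number_of_clusters=32,
+    cluster_centroids_init=tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS)
+
+# ...compile and fit clustered_model as usual, then remove the clustering wrappers:
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```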
+ +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. + + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. 
+ converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
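+        # The selected noise slice is scaled by the chosen volume, added to the (possibly
+        # time-shifted) foreground, and clipped back to [-1, 1] before MFCC features are computed.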
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
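+            # Note: set_size was captured before the silence clips were appended, so the number of
+            # unknown clips (like the silence count above) is relative to the original partition size.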
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/ds_cnn_m_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + 
parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py new file mode 100644 index 0000000..f1ea40a --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from ds_cnn_m_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..fea007f --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DS_CNN_Medium - Hero\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DS_CNN_Medium \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Hero |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 11:54:08.485801: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 11:54:58.475678: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 11:54:58.516721: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:54:58.516765: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:54:58.537249: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 11:54:58.537321: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 11:54:58.540057: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 11:54:58.540315: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 11:54:58.540872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 11:54:58.541591: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 11:54:58.541745: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 11:54:58.542218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:54:58.542511: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 11:54:58.543331: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 11:54:58.543822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:54:58.543872: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:54:58.966709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:54:58.966747: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:54:58.966761: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:54:58.967266: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 11:55:01.322474: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 11:55:03.039244: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 11:55:03.039493: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 11:55:03.039987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:55:03.040276: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:55:03.040309: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:55:03.040317: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:55:03.040325: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:55:03.040640: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:55:03.059483: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 11:55:03.063108: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.01ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 11:55:03.313219: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 11:55:03.313256: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 11:55:03.318616: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 11:55:03.321473: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:55:03.321732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:55:03.321763: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:55:03.321773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:55:03.321780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:55:03.322065: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to ds_cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 11:55:03.376097: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 332 3 5 1 5 6 3 8 3 2 3]\n", + " [ 0 4 386 1 0 0 4 0 0 0 0 2]\n", + " [ 0 5 2 378 2 3 3 0 0 1 1 11]\n", + " [ 0 1 2 0 324 1 0 0 1 16 4 1]\n", + " [ 0 3 0 8 1 360 0 0 1 1 1 2]\n", + " [ 1 0 8 1 1 0 338 3 0 0 0 0]\n", + " [ 0 2 1 1 0 0 1 356 0 1 1 0]\n", + " [ 1 5 0 2 4 0 0 0 341 10 0 0]\n", + " [ 0 2 0 0 16 0 3 0 4 345 2 1]\n", + " [ 1 1 0 0 12 1 0 1 0 1 332 1]\n", + " [ 0 4 0 13 2 4 1 0 1 1 1 345]]\n", + "Validation accuracy = 94.67%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 362 2 8 5 2 8 3 5 1 2 10]\n", + " [ 0 7 402 2 0 0 7 0 0 1 0 0]\n", + " [ 0 4 1 389 0 4 1 0 0 0 0 6]\n", + " [ 0 6 0 0 397 1 0 0 4 12 5 0]\n", + " [ 0 8 1 14 0 374 3 1 1 0 1 3]\n", + " [ 0 8 5 1 0 0 396 2 0 0 0 0]\n", + " [ 0 6 0 0 0 1 4 383 0 1 1 0]\n", + " [ 0 4 0 0 7 3 1 0 368 13 0 0]\n", + " [ 0 5 0 2 11 0 1 0 5 375 0 3]\n", + " [ 0 3 0 0 8 2 1 1 0 0 394 2]\n", + " [ 0 5 1 27 3 1 1 1 0 1 0 362]]\n", + "Test accuracy = 94.27%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 11:55:32.290813: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 11:56:25.228757: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 11:56:25.264869: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB 
deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:56:25.264908: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:56:25.285323: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 11:56:25.285388: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 11:56:25.288128: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 11:56:25.288385: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 11:56:25.288944: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 11:56:25.289667: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 11:56:25.289820: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 11:56:25.292002: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:56:25.292281: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 11:56:25.293162: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:56:25.293718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:56:25.293799: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 11:56:25.736053: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:56:25.736092: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:56:25.736100: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:56:25.736608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 11:56:28.038374: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 11:56:29.838652: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 11:56:29.838886: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 11:56:29.839342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:56:29.839606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:56:29.839637: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:56:29.839648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:56:29.839655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:56:29.839941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:56:29.859427: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 11:56:29.863763: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.013ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.002ms.\n", + "\n", + "2023-01-31 11:56:30.003088: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 11:56:30.003122: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 11:56:30.008047: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 11:56:30.010836: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 11:56:30.011085: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 11:56:30.011115: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 11:56:30.011125: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 11:56:30.011131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 11:56:30.011421: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11002 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 11:56:30.051239: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to ds_cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 328 2 5 1 8 5 3 8 4 2 5]\n", + " [ 0 6 375 1 0 1 10 0 0 0 1 3]\n", + " [ 0 9 2 368 1 3 6 0 1 0 4 12]\n", + " [ 0 3 1 0 319 1 1 0 2 13 9 1]\n", + " [ 0 3 2 9 0 350 1 0 3 1 2 6]\n", + " [ 1 3 8 1 1 0 334 3 0 0 0 1]\n", + " [ 0 4 1 0 1 0 1 351 0 1 2 2]\n", + " [ 1 6 0 1 4 0 0 0 343 7 0 1]\n", + " [ 0 5 0 0 21 0 3 1 4 333 3 3]\n", + " [ 1 2 0 0 11 0 0 1 1 2 331 1]\n", + " [ 0 7 0 15 2 4 1 0 0 1 3 339]]\n", + "Validation accuracy = 93.18%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 363 1 7 2 1 11 3 4 1 3 12]\n", + " [ 0 7 399 2 0 0 8 0 1 0 0 2]\n", + " [ 0 5 1 380 0 4 4 0 0 0 2 9]\n", + " [ 0 8 0 0 390 1 1 2 4 10 8 1]\n", + " [ 0 8 1 14 2 370 2 1 0 0 4 4]\n", + " [ 0 9 4 1 1 0 395 2 0 0 0 0]\n", + " [ 0 8 2 0 2 1 8 372 0 1 2 0]\n", + " [ 0 9 0 0 9 3 1 0 358 12 1 3]\n", + " [ 0 7 0 2 15 0 1 0 4 362 4 7]\n", + " [ 0 3 0 0 7 4 1 2 0 1 391 2]\n", + " [ 0 9 2 26 3 4 0 0 2 1 4 351]]\n", + "Test accuracy = 92.82%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. 
For example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --train\n",
+ "```\n",
+ "\n",
+ "Training is then performed and should produce a model that reaches the accuracy stated in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script and this time supply the path to the new checkpoint files you want to use, for example:\n",
+ "\n",
+ "```bash\n",
+ "bash ./recreate_model.sh --ckpt \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.0 Training\n",
+ "\n",
+ "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variant of the models from the paper; other variants are covered in other parts of the repository.\n",
+ "\n",
+ "\n",
+ "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n",
+ "```\n",
+ "python train.py --model_architecture dnn --model_size_info 128 128 128\n",
+ "```\n",
+ "\n",
+ "The command-line argument *--model_size_info* is used to pass the neural network layer\n",
+ "dimensions such as number of layers, convolution filter size/stride as a list to models.py,\n",
+ "which builds the TensorFlow graph based on the provided model architecture\n",
+ "and layer dimensions. For more info on *model_size_info* for each network architecture see\n",
+ "[models.py](model_core_utils/models.py).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.0 Testing\n",
+ "To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run:\n",
+ "```\n",
+ "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n",
+ "```\n",
+ "**The model and feature extraction parameters passed to this script should match those used in the Training step.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.0 Optimization\n",
+ "\n",
+ "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n",
+ "\n",
+ "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the kmeans++ cluster initialization method are used as the clustering hyperparameters.\n",
+ "\n",
+ "To optimize your trained model (e.g. 
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python ds_cnn_m_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..ae2c70e --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DS-CNN Medium model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 620951417ca52a1640bb25490ca7b34507fe8881 | +| Size (Bytes) | 548468 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 94.27% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
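+
+The example input and output `.npy` files referenced in the tables below can be used for a quick regression check of this fp32 TFLite file. The following is a minimal sketch, assuming TensorFlow and NumPy are installed and the repository root is the current working directory; the printed difference check is illustrative rather than a documented requirement of this package:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+base = "models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32"
+
+# Load the fp32 DS-CNN Medium TFLite file shipped in this folder.
+interpreter = tf.lite.Interpreter(model_path=f"{base}/ds_cnn_m.tflite")
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Example MFCC input of shape (1, 490) and reference output of shape (1, 12).
+x = np.load(f"{base}/testing_input/input/0.npy").astype(np.float32)
+y_ref = np.load(f"{base}/testing_output/Identity/0.npy")
+
+interpreter.set_tensor(inp["index"], x)
+interpreter.invoke()
+y = interpreter.get_tensor(out["index"])
+
+# A small difference indicates the model reproduces the stored reference output.
+print("max abs diff:", np.abs(y - y_ref).max())
+```
+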
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..2277065 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + accuracy: 94.27% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the DS-CNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 548468 + filename: ds_cnn_m.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 620951417ca52a1640bb25490ca7b34507fe8881 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite new file mode 100644 index 0000000..b4b2f28 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_m.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405ba6ec5977ae6bd42ac153deb02f471bcd76e6c07b127352e4a0f3ca5be054 +size 548468 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..701fcd4 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcdf0702505989d7a0fdffca09308abde32082a1f56bad845c05fbca24e87aa4 +size 2088 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..f6082ba --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a107cccce62cb03a3aadc59387f87ecb46a6e4bf81ed5f67d15750fa8b78fec +size 176 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..331b883 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,63 @@ +# keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DS-CNN Medium model developed by Arm, from the Hello Edge paper. 
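+
+Because the model is fully quantized, its input and output tensors are int8: real-valued MFCC features must be quantized with the scale and zero point stored in the TFLite file, and the int8 output dequantized back to keyword scores. The sketch below is illustrative only; it assumes TensorFlow and NumPy are available, runs from the repository root, and uses a zero placeholder instead of real MFCC features:
+
+```python
+import numpy as np
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(
+    model_path="models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_m_quantized.tflite")
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Quantize a float MFCC feature vector of shape (1, 490) to int8.
+scale, zero_point = inp["quantization"]
+mfcc = np.zeros((1, 490), dtype=np.float32)  # placeholder features
+x_int8 = np.clip(np.round(mfcc / scale + zero_point), -128, 127).astype(np.int8)
+
+interpreter.set_tensor(inp["index"], x_int8)
+interpreter.invoke()
+
+# Dequantize the int8 output back to scores over the 12 keywords.
+out_scale, out_zero_point = out["quantization"]
+scores = (interpreter.get_tensor(out["index"]).astype(np.float32) - out_zero_point) * out_scale
+print(scores.shape)  # (1, 12)
+```
+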
+ +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | 740d32adde16948b2ab45e1e8c856de2925a05eb | +| Size (Bytes) | 186288 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 93.93% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | + diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..7cc5a2a --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + Accuracy: 93.93% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DS-CNN Medium model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 186288 + filename: ds_cnn_m_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 740d32adde16948b2ab45e1e8c856de2925a05eb + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. 
+ example_output: + path: models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. + name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Deployable + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ds_cnn_m_quantized.tflite b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_m_quantized.tflite similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ds_cnn_m_quantized.tflite rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_m_quantized.tflite diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb new file mode 100644 index 0000000..d1cf98b --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2c679859ef8fe55a5240076d46d21fb6058d6f5eb6789e8f66484c0eb5606c +size 65455 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb new file mode 100644 index 0000000..edf9f9d --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3441fda9da39b45faa7e26c777cb8608318cb6140df5aee5470f2a94c04b5a7 +size 711776 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..fa0e037 --- 
/dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a822e0c17c8dc7500805a9833fd2558ffe89da671932747c508402e60c7405 +size 583382 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index new file mode 100644 index 0000000..24cf127 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/saved_model/ds_cnn_medium/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f94d6215cd19d5651d333504aad08c2d1450afae072b86e9d6c344b8e23fd26 +size 3642 diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/checkpoint similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/checkpoint diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.data-00000-of-00001 rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.index similarity index 100% rename from models/keyword_spotting/ds_cnn_medium/tflite_int8/ckpt/ds_cnn_0.95_ckpt.index rename to models/keyword_spotting/ds_cnn_medium/model_package_tf/model_archive/baseline/weights/ds_cnn_0.95_ckpt.index diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
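+# Worked example (added for illustration only, not used by the code below): with the
+# DS-CNN Medium feature settings from recreate_model.sh (sample_rate=16000,
+# clip_duration_ms=1000, window_size_ms=40, window_stride_ms=20,
+# dct_coefficient_count=10), prepare_model_settings() gives
+#   desired_samples       = 16000 * 1000 / 1000      = 16000
+#   window_size_samples   = 16000 * 40 / 1000        = 640
+#   window_stride_samples = 16000 * 20 / 1000        = 320
+#   spectrogram_length    = 1 + (16000 - 640) // 320 = 49
+#   fingerprint_size      = 10 * 49                  = 490
+# which matches the (1, 490) input shape described in the TFLite model READMEs.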
+"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. 
+ + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. 
+ + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. + x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. 
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
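+    # Illustrative example (matching this script's defaults further down):
+    # --how_many_training_steps '3750,750' with --learning_rate '0.001,0.0001' gives
+    # training_steps_list = [3750, 750] and learning_rates_list = [0.001, 0.0001], so
+    # lr_boundary_list = [3750] and the schedule uses a learning rate of 0.001 for the
+    # first 3750 fine-tuning steps and 0.0001 for the remaining 750 steps.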
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. 
+ """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..278bddd --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +ckpt_path=model_archive/model_source/weights/ds_cnn_0.95_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DS-CNN Medium training +if [ "$train" = true ] +then +python train.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_M/retrain_logs --train_dir work/DS_CNN/DS_CNN_M/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 172 10 4 2 1 172 3 3 2 2 172 3 3 1 1 172 3 3 1 1 172 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_medium/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md deleted file mode 100644 index c675a6f..0000000 --- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DS-CNN Medium INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DS-CNN Medium model developed by Arm, with 
training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | 740d32adde16948b2ab45e1e8c856de2925a05eb | -| Size (Bytes) | 186288 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.941 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: HERO | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml deleted file mode 100644 index c77867c..0000000 --- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/definition.yaml +++ /dev/null @@ -1,45 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 94.13% -description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN - Medium model developed by Arm, with training checkpoints, from the Hello Edge paper. - Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 186288 - filename: ds_cnn_m_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: 740d32adde16948b2ab45e1e8c856de2925a05eb - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: hero#CORTEX-M -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. 
- name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/ds_cnn_medium/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_medium/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md new file mode 100644 index 0000000..077f31c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/README.md @@ -0,0 +1,115 @@ +# DS-CNN Small model package + +This folder contains code that will allow you to recreate the DS-CNN Small keyword spotting model from +the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf). + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Model Package Overview +| Model | DS_CNN_Small | +|:---------------: |:------------------------------------------:| +| **Format**: | Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 | +| **Feature**: | Keyword spotting for Arm Cortex-M CPUs | +| **Architectural Delta w.r.t. Vanilla**: | None | +| **Domain**: | Keyword spotting | +| **Package Quality**: | Hero | + +## Model Recreation + +In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```. + +Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run: + +```bash +bash ./recreate_model.sh +``` + +Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder +to generate the TFLite files and perform evaluation on the test sets. Both an fp32 version and a quantized version will be produced. +The quantized version will use post-training quantization to fully quantize it. + +If you want to run training from scratch you can do this by supplying ```--train``` when running the script. For example: + +```bash +bash ./recreate_model.sh --train +``` + +Training is then performed and should produce a model to the stated accuracy in this repository. 
+Note that exporting to TFLite will still happen with the pre-trained checkpoint files so you will need to re-run the script +and this time supply the path to the new checkpoint files you want to use, for example: + +```bash +bash ./recreate_model.sh --ckpt +``` + + +## Training + +To train a DNN with 3 fully-connected layers with 128 neurons in each layer, run: + +``` +python train.py --model_architecture dnn --model_size_info 128 128 128 +``` +The command line argument *--model_size_info* is used to pass the neural network layer +dimensions such as number of layers, convolution filter size/stride as a list to models.py, +which builds the TensorFlow graph based on the provided model architecture +and layer dimensions. For more info on *model_size_info* for each network architecture see +[models.py](models.py). + +The training commands with all the hyperparameters to reproduce the models shown in the +[paper](https://arxiv.org/pdf/1711.07128.pdf) are given [here](recreate_model.sh). + +## Testing +To run inference on the trained model from a checkpoint and get accuracy on validation and test sets, run: +``` +python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step. + +## Optimization + +We introduce a new *optional* step to optimize the trained keyword spotting model for deployment. + +Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and kmeans++ cluster intialization method are used as the clustering hyperparameters. + +To optimize your trained model (e.g. a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on. +You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint. + +To apply the optimization and fine-tuning, run the following command: +``` +python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint +``` +The parameters used here should match those used in the Training step, except for the number of training steps. +The number of training steps is reduced since the optimization step only requires fine-tuning. + +This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model. + +## Quantization and TFLite Conversion + +As part of the update we now use TensorFlow's +[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to +make quantization of the trained models super simple. + +To quantize your trained model (e.g. a DNN) run: +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16] +``` +The parameters used here should match those used in the Training step. + +The inference_type parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32. + +This step will produce a quantized TFLite file *dnn_quantized.tflite*. +You can test the accuracy of this quantized model on the test set by running: +``` +python evaluation.py --tflite_path dnn_quantized.tflite +``` +The parameters used here should match those used in the Training step. + +`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. 
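+
+For reference, below is a minimal sketch of what the clustering step described in the Optimization section above amounts to. It uses TensorFlow's `tensorflow-model-optimization` (tfmot) weight clustering API with 32 clusters and kmeans++ centroid initialization, as stated above; the `model` and `train_dataset` names and the fine-tuning settings are placeholders only, and the repository's `optimisations.py` script remains the supported way to run this step.
+
+```python
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot  # assumed to be available via requirements.txt
+
+# Clustering hyperparameters as described above: 32 clusters, kmeans++ initialization.
+clustering_params = {
+    'number_of_clusters': 32,
+    'cluster_centroids_init': tfmot.clustering.keras.CentroidInitialization.KMEANS_PLUS_PLUS,
+}
+
+# `model` is assumed to be a trained Keras KWS model restored from a checkpoint.
+clustered_model = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
+
+# Fine-tune briefly (far fewer steps than full training), then strip the clustering
+# wrappers before exporting or quantizing the model.
+clustered_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
+                        loss='sparse_categorical_crossentropy',
+                        metrics=['accuracy'])
+clustered_model.fit(train_dataset, epochs=1)  # placeholder fine-tuning loop
+final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
+```
+
+The stripped model can then be passed through the same post-training quantization flow described above to obtain a clustered and quantized TFLite file.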
If you wish to convert to a floating point TFLite model, use the command below: + +``` +python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize +``` + +This will produce a floating point TFLite file *dnn.tflite*. You can test the accuracy of this floating point model using `evaluation.py` as above. diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py new file mode 100644 index 0000000..64ab8df --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/convert_to_tflite.py @@ -0,0 +1,234 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for converting and quantizing a trained keyword spotting + model and saving to TFLite.""" + +import argparse + +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from evaluation import tflite_test + +NUM_REP_DATA_SAMPLES = 100 # How many samples to use for post training quantization. + + +def convert(model_settings, audio_processor, checkpoint, quantize, inference_type, tflite_path): + """Load our trained floating point model and convert it. + + TFLite conversion or post training quantization is performed and the + resulting model is saved as a TFLite file. + We use samples from the validation set to do post training quantization. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + checkpoint: Path to training checkpoint to load. + quantize: Whether to quantize the model or convert to fp32 TFLite model. + inference_type: Input/output type of the quantized model. + tflite_path: Output TFLite file save path. + """ + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(checkpoint).expect_partial() + + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + + def _rep_dataset(): + """Generator function to produce representative dataset.""" + i = 0 + for mfcc, label in val_data: + if i > NUM_REP_DATA_SAMPLES: + break + i += 1 + yield [mfcc] + + if quantize: + # Quantize model and save to disk. + tflite_model = post_training_quantize(model, inference_type, _rep_dataset) + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Quantized model saved to {tflite_path}.') + else: + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + with open(tflite_path, 'wb') as f: + f.write(tflite_model) + print(f'Converted model saved to {tflite_path}.') + + +def post_training_quantize(keras_model, inference_type, rep_dataset): + """Perform post training quantization and returns the TFLite model ready for saving. + + See https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization for + more details. 
+ + Args: + keras_model: The trained tf Keras model used for post training quantization. + inference_type: Input/output type of the quantized model. + rep_dataset: Function to use as a representative dataset, must be callable. + + Returns: + Quantized TFLite model ready for saving to disk. + """ + converter = tf.lite.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + if inference_type == 'int8': + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + supported_ops = tf.lite.OpsSet.TFLITE_BUILTINS_INT8 + if inference_type == 'int16': + converter.inference_input_type = tf.int16 + converter.inference_output_type = tf.int16 + supported_ops = tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + + # Int8 post training quantization needs representative dataset. + converter.representative_dataset = rep_dataset + converter.target_spec.supported_ops = [supported_ops] + + tflite_model = converter.convert() + + return tflite_model + + +def main(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.quantize: + tflite_path = f'{FLAGS.model_architecture}_quantized.tflite' + else: + tflite_path = f'{FLAGS.model_architecture}.tflite' + + # Load floating point model from checkpoint and convert it. + convert(model_settings, audio_processor, FLAGS.checkpoint, + FLAGS.quantize, FLAGS.inference_type, tflite_path) + + # Test the newly converted model on the test set. + tflite_test(model_settings, audio_processor, tflite_path) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. 
+ """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from.') + parser.add_argument( + '--quantize', + dest='quantize', + action="store_true", + default=True, + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--no-quantize', + dest='quantize', + action="store_false", + help='Whether to quantize the model or convert to fp32 TFLite model. Defaults to True.') + parser.add_argument( + '--inference_type', + type=str, + default='fp32', + help='If quantize is true, whether the model input and output is float32, int8 or int16') + + FLAGS, _ = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/__init__.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py new file mode 100644 index 0000000..05cf5ba --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/data_processing/data_preprocessing.py @@ -0,0 +1,462 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Modifications Copyright 2023 Arm Inc. All Rights Reserved. +# Modified to use TensorFlow 2.0 and data pipelines. 
+# +"""Functions for loading and preparing data for keyword spotting.""" + +import os +import re +import sys +import urllib +from pathlib import Path +import tarfile +import hashlib +import random +import math +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops import gen_audio_ops as audio_ops + +MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M +RANDOM_SEED = 59185 +BACKGROUND_NOISE_DIR_NAME = '_background_noise_' +SILENCE_LABEL = '_silence_' +SILENCE_INDEX = 0 +UNKNOWN_WORD_INDEX = 1 +UNKNOWN_WORD_LABEL = '_unknown_' + + +def load_wav_file(wav_filename, desired_samples): + """Loads and then decodes a given 16bit PCM wav file. + + Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples. + + Args: + wav_filename: 16bit PCM wav file to load. + desired_samples: Number of samples wanted from the audio file. + + Returns: + Tuple consisting of the decoded audio and sample rate. + """ + wav_file = tf.io.read_file(wav_filename) + decoded_wav = audio_ops.decode_wav(wav_file, desired_channels=1, desired_samples=desired_samples) + + return decoded_wav.audio, decoded_wav.sample_rate + + +def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc): + """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal. + + Args: + audio_signal: Raw audio signal in range [-1, 1] + audio_sample_rate: Audio signal sample rate + window_size: Window size in samples for calculating spectrogram + window_stride: Window stride in samples for calculating spectrogram + num_mfcc: The number of MFCC features wanted. + + Returns: + Calculated mffc features. + """ + spectrogram = audio_ops.audio_spectrogram(input=audio_signal, window_size=window_size, stride=window_stride, + magnitude_squared=True) + + mfcc_features = audio_ops.mfcc(spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc) + + return mfcc_features + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + base_name = os.path.basename(filename) + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name) + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % + (MAX_NUM_WAVS_PER_CLASS + 1)) * + (100.0 / MAX_NUM_WAVS_PER_CLASS)) + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + return result + + +def prepare_words_list(wanted_words): + """Prepends common tokens to the custom word list. + + Args: + wanted_words: List of strings containing custom words to spot. + + Returns: + List of words with silence and unknown tokens added. + """ + return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words + + +class AudioProcessor: + """Handles loading, partitioning, and preparing audio training data.""" + + class Modes(Enum): + TRAINING = 1 + VALIDATION = 2 + TESTING = 3 + + def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage, + wanted_words, validation_percentage, testing_percentage, model_settings): + self.data_dir = Path(data_dir) + self.model_settings = model_settings + self.words_list = prepare_words_list(wanted_words) + + self._tf_datasets = {} + self.background_data = None + self._set_size = {'training': 0, 'validation': 0, 'testing': 0} + + self._download_and_extract_data(data_url, data_dir) + self._prepare_datasets(silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage) + self._prepare_background_data() + + def get_data(self, mode, background_frequency=0, background_volume_range=0, time_shift=0): + """Returns the train, validation or test set for KWS as a TF Dataset. + + Args: + mode: The set to return, see AudioProcessor.Modes enumeration. + background_frequency: How many of the samples have background noise mixed in. + background_volume_range: How loud the background noise should be, between 0 and 1. + time_shift: Range to randomly shift the training audio by in time. + + Returns: + TF dataset that will generate tuples containing an mfcc and corresponding label. + + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + dataset = self._tf_datasets['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + dataset = self._tf_datasets['validation'] + elif mode == AudioProcessor.Modes.TESTING: + dataset = self._tf_datasets['testing'] + else: + ValueError("Incorrect dataset type given") + + use_background = (self.background_data is not None) and (mode == AudioProcessor.Modes.TRAINING) + dataset = dataset.map(lambda path, label: self._process_path(path, label, self.model_settings, + background_frequency, background_volume_range, + time_shift, use_background, self.background_data), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + def set_size(self, mode): + """Get the number of samples in the requested dataset partition. + + Args: + mode: Which partition, see AudioProcessor.Modes enumeration. + + Returns: + Number of samples in the partition. 
+ + Raises: + ValueError: If mode is not recognised. + """ + if mode == AudioProcessor.Modes.TRAINING: + return self._set_size['training'] + elif mode == AudioProcessor.Modes.VALIDATION: + return self._set_size['validation'] + elif mode == AudioProcessor.Modes.TESTING: + return self._set_size['testing'] + else: + ValueError('Incorrect dataset type given') + + @staticmethod + def _process_path(path, label, model_settings, background_frequency, background_volume_range, time_shift_samples, + use_background, background_data): + """Load wav files and calculate mfcc features. + + Random shifting of samples and adding in background noise is done within this function as well. + This function is meant to be mapped onto a TF Dataset by using a lambda function. + + Args: + path: Path to the wav file to load. + label: Integer label for classifying the audio clip. + model_settings: Dictionary of settings for model being trained. + background_frequency: How many clips will have background noise, 0.0 to 1.0. + background_volume_range: How loud the background noise will be. + time_shift_samples: How much to randomly shift the clips by. + use_background: Add in background noise to audio clips or not. + background_data: Ragged tensor of loaded background noise samples. + + Returns: + Tuple of calculated flattened mfcc and its class label. + """ + + desired_samples = model_settings['desired_samples'] + audio, sample_rate = load_wav_file(path, desired_samples=desired_samples) + + # Make our own silence audio data. + if label == SILENCE_INDEX: + audio = tf.multiply(audio, 0) + + # Shift samples start position and pad any gaps with zeros. + if time_shift_samples > 0: + time_shift_amount = tf.random.uniform(shape=(), minval=-time_shift_samples, maxval=time_shift_samples, + dtype=tf.int32) + else: + time_shift_amount = 0 + if time_shift_amount > 0: + time_shift_padding = [[time_shift_amount, 0], [0, 0]] + time_shift_offset = [0, 0] + else: + time_shift_padding = [[0, -time_shift_amount], [0, 0]] + time_shift_offset = [-time_shift_amount, 0] + + padded_foreground = tf.pad(audio, time_shift_padding, mode='CONSTANT') + sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1]) + + # Get a random section of background noise. + if use_background: + background_index = tf.random.uniform(shape=(), maxval=background_data.shape[0], dtype=tf.int32) + background_sample = background_data[background_index] + background_offset = tf.random.uniform(shape=(), maxval=len(background_sample)-desired_samples, + dtype=tf.int32) + background_clipped = background_sample[background_offset:(background_offset + desired_samples)] + background_reshaped = tf.reshape(background_clipped, [desired_samples, 1]) + if tf.random.uniform(shape=(), maxval=1) < background_frequency: + background_volume = tf.random.uniform(shape=(), maxval=background_volume_range) + else: + background_volume = tf.constant(0, dtype='float32') + else: + background_reshaped = np.zeros([desired_samples, 1], dtype=np.float32) + background_volume = tf.constant(0, dtype='float32') + + # Mix in background noise. 
+ background_mul = tf.multiply(background_reshaped, background_volume) + background_add = tf.add(background_mul, sliced_foreground) + background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) + + mfcc = calculate_mfcc(background_clamp, sample_rate, model_settings['window_size_samples'], + model_settings['window_stride_samples'], + model_settings['dct_coefficient_count']) + mfcc = tf.reshape(mfcc, [-1]) + + return mfcc, label + + def _download_and_extract_data(self, data_url, target_directory): + """Downloads and extracts file to target directory. + + If the file does not already exist download it and then untar into the target directory. + + Args: + data_url: Web link to the tarred data to download. + target_directory: Directory to download and extract to. + """ + target_directory = Path(target_directory) + target_directory.mkdir(exist_ok=True) + + filename = data_url.split('/')[-1] + filepath = target_directory / filename + + if not filepath.exists(): + def _report_hook(block_num, block_size, total_size): + """Function to track download progress in urllib""" + read_so_far = block_num * block_size + percent = (read_so_far / total_size) * 100.0 + + s = f"\rDownloading {filename} {percent:.1f}%" + + sys.stdout.write(s) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(data_url, filepath, _report_hook) + print() + + print(f'Untarring {filename}...') + tarfile.open(filepath, 'r:gz').extractall(target_directory) + + def _prepare_datasets(self, silence_percentage, unknown_percentage, wanted_words, + validation_percentage, testing_percentage): + """Split the data into train, validation and testing sets. + + Silence and unknown data is added, then sets are converted to TF Datasets. + + Args: + silence_percentage: Percent of words should be silence. + unknown_percentage: Percent of words that should be unknown. + wanted_words: List of words wanted to classify. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + """ + # Make sure the shuffling and picking of unknowns is deterministic. + random.seed(RANDOM_SEED) + wanted_words_index = {} + + for index, wanted_word in enumerate(wanted_words): + wanted_words_index[wanted_word] = index + 2 + + # Find all wav files in subfolders. + search_path = self.data_dir / '*' / '*.wav' + data_index, unknown_index, all_words = self._find_and_sort_wavs(search_path, validation_percentage, + testing_percentage, wanted_words_index) + + for index, wanted_word in enumerate(wanted_words): + if wanted_word not in all_words: + raise Exception(f'Tried to find {wanted_word} in labels but only found: {", ".join(all_words.keys())}') + + word_to_index = {} + for word in all_words: + if word in wanted_words_index: + word_to_index[word] = wanted_words_index[word] + else: + word_to_index[word] = UNKNOWN_WORD_INDEX + word_to_index[SILENCE_LABEL] = SILENCE_INDEX + + # We need an arbitrary file to load as the input for the silence samples. + # It's multiplied by zero later, so the content doesn't matter. + silence_wav_path = data_index['training'][0]['file'] + for set_index in ['validation', 'testing', 'training']: + set_size = len(data_index[set_index]) # Size before adding silence and unknown samples. + silence_size = int(math.ceil(set_size * silence_percentage / 100)) + for _ in range(silence_size): + data_index[set_index].append({ + 'label': SILENCE_LABEL, + 'file': silence_wav_path + }) + # Pick some unknowns to add to each partition of the data set. 
+ random.shuffle(unknown_index[set_index]) + unknown_size = int(math.ceil(set_size * unknown_percentage / 100)) + data_index[set_index].extend(unknown_index[set_index][:unknown_size]) + + self._set_size[set_index] = len(data_index[set_index]) # Size after adding silence and unknown samples. + + # Make sure the ordering is random. + random.shuffle(data_index[set_index]) + + # Transform into TF Datasets ready for easier processing later. + labels, paths = list(zip(*[d.values() for d in data_index[set_index]])) + labels = [word_to_index[label] for label in labels] + self._tf_datasets[set_index] = tf.data.Dataset.from_tensor_slices((list(paths), labels)) + + def _find_and_sort_wavs(self, search_pattern, validation_percentage, testing_percentage, wanted_words_index): + """Find and sort wav files into known and unknown word sets. + + Known words are files containing words in the list of wanted words. + Any other clip goes to the unknown label set. Labels come from the folder names. + All clips are also assigned to train, test and validation sets. + + Args: + search_pattern: Path pattern used by glob to find wav files. + validation_percentage: Percent to split off for validation. + testing_percentage: Percent to split off for testing. + wanted_words_index: Dict mapping wanted words to their label index. + + Returns: + 3-tuple of known words, unknown words and mapping of all word labels. + """ + data_index = {'validation': [], 'testing': [], 'training': []} + unknown_index = {'validation': [], 'testing': [], 'training': []} + all_words = {} + + for wav_path in sorted(tf.io.gfile.glob(str(search_pattern))): + word = Path(wav_path).parent.name.lower() + + # Treat the '_background_noise_' folder as a special case, since we expect + # it to contain long audio samples we mix in to improve training. + if word == BACKGROUND_NOISE_DIR_NAME: + continue + + all_words[word] = True + set_index = which_set(wav_path, validation_percentage, testing_percentage) + # If it's a known class, store its detail, otherwise add it to the list + # we'll use to train the unknown label. + if word in wanted_words_index: + data_index[set_index].append({'label': word, 'file': wav_path}) + else: + unknown_index[set_index].append({'label': word, 'file': wav_path}) + if not all_words: + raise Exception('No .wavs found at ' + str(search_pattern)) + + return data_index, unknown_index, all_words + + def _prepare_background_data(self): + """Searches a folder for background noise audio, and loads it into memory. + + It's expected that the background audio samples will be in a subdirectory + named '_background_noise_' inside the 'data_dir' folder, as .wavs that match + the sample rate of the training data, but can be much longer in duration. + + If the '_background_noise_' folder doesn't exist at all, this isn't an + error, it's just taken to mean that no background noise augmentation should + be used. If the folder does exist, but it's empty, that's treated as an + error. + + Returns: + Ragged tensor of raw PCM-encoded audio samples of background noise. + None if '_background_noise_' folder doesnt exist. + + Raises: + Exception: If files aren't found in the folder. 
+ """ + background_data = [] + background_dir = Path(self.data_dir / BACKGROUND_NOISE_DIR_NAME) + if not background_dir.exists(): + self.background_data = None + return + + search_path = Path(background_dir / '*.wav') + for wav_path in tf.io.gfile.glob(str(search_path)): + wav_data, _ = load_wav_file(wav_path, desired_samples=-1) + background_data.append(tf.reshape(wav_data, [-1])) + + if not background_data: + raise Exception('No background wav files were found in ' + str(search_path)) + + # Ragged tensor as we cant use lists in tf dataset map functions. + self.background_data = tf.ragged.stack(background_data) diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py new file mode 100644 index 0000000..db7694a --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_keras.py @@ -0,0 +1,76 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import argparse + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + + model = tf.keras.models.load_model(FLAGS.keras_file_path) + predictions = model.predict(x) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--keras_file_path', + type=str, + default='', + help='Path to the .h5 Keras model file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git 
a/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py new file mode 100644 index 0000000..9f79d99 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/ds_cnn_s_inference_tflite.py @@ -0,0 +1,120 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from data_processing.data_preprocessing import load_wav_file, calculate_mfcc + +import tensorflow as tf +import numpy as np +import argparse + + +def tflite_inference(input_data, tflite_path): + """Call forwards pass of TFLite file and returns the result. + + Args: + input_data: Input data to use on forward pass. + tflite_path: Path to TFLite file to run. + + Returns: + Output from inference. + """ + supported_quant_dtypes = (np.int8, np.int16) + interpreter = tf.lite.Interpreter(model_path=tflite_path) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + input_dtype = input_details[0]["dtype"] + output_dtype = output_details[0]["dtype"] + + # Check if the input/output type is quantized, + # set scale and zero-point accordingly + if input_dtype in supported_quant_dtypes: + input_scale, input_zero_point = input_details[0]["quantization"] + else: + input_scale, input_zero_point = 1, 0 + + input_data = input_data / input_scale + input_zero_point + input_data = np.round(input_data) if input_dtype in supported_quant_dtypes else input_data + + if output_dtype in supported_quant_dtypes: + output_scale, output_zero_point = output_details[0]["quantization"] + else: + output_scale, output_zero_point = 1, 0 + + interpreter.set_tensor(input_details[0]['index'], tf.cast(input_data, input_dtype)) + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + + output_data = output_scale * (output_data.astype(np.float32) - output_zero_point) + + return output_data + + +def load_labels(filename): + """Read in labels, one label per line.""" + f = open(filename, "r") + return f.read().splitlines() + + +def main(): + window_size_samples = int(FLAGS.sample_rate * FLAGS.window_size_ms / 1000) + window_stride_samples = int(FLAGS.sample_rate * FLAGS.window_stride_ms / 1000) + decoded, sample = load_wav_file(FLAGS.wav, FLAGS.sample_rate) + x = calculate_mfcc(decoded, sample, window_size_samples, window_stride_samples, FLAGS.dct_coefficient_count) + x = tf.reshape(x, [1, -1]) + predictions = tflite_inference(x, FLAGS.tflite_path) + + # Sort to show labels in order of confidence + top_k = predictions[0].argsort()[-1:][::-1] + for node_id in top_k: + human_string = load_labels(FLAGS.labels)[int(node_id)] + score = predictions[0,node_id] + print(f'model predicted: {human_string} with score {score:.5f}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--wav', type=str, default='', help='Audio file to be identified.') + 
parser.add_argument( + '--labels', type=str, default='', help='Path to file containing labels.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs', ) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is', ) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint', ) + parser.add_argument( + '--tflite_path', + type=str, + default='', + help='Path to TFLite file to use for testing.') + FLAGS, unparsed = parser.parse_known_args() + main() diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py new file mode 100644 index 0000000..9488d35 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/evaluation.py @@ -0,0 +1,250 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for testing trained keyword spotting models from checkpoint files and TFLite files.""" + +import argparse + +import numpy as np +import tensorflow as tf + +from data_processing import data_preprocessing +from model_core_utils import models +from ds_cnn_s_inference_tflite import tflite_inference + + +def tflite_test(model_settings, audio_processor, tflite_path): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A TFLite model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + tflite_path: Path to TFLite file to use for inference. + """ + # Evaluate on validation set. + print("Running TFLite evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(1) + expected_indices = np.concatenate([y for x, y in val_data]) + predicted_indices = [] + + for mfcc, label in val_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. 
+ print("Running TFLite evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(1) + expected_indices = np.concatenate([y for x, y in test_data]) + predicted_indices = [] + + for mfcc, label in test_data: + prediction = tflite_inference(mfcc, tflite_path) + predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1))) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def keras_test(model_settings, audio_processor, model): + """Calculate accuracy and confusion matrices on the validation and test sets. + + A loaded keras model is used for doing testing. + + Args: + model_settings: Dictionary of common model settings. + audio_processor: Audio processor class object. + model: Loaded keras model. + """ + # Evaluate on validation set. + print("Running TF evaluation on validation set...") + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in val_data]) + + predictions = model.predict(val_data) + predicted_indices = tf.argmax(predictions, axis=1) + + val_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Validation accuracy = {val_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.VALIDATION)})') + + # Evaluate on testing set. + print("Running TF evaluation on test set...") + test_data = audio_processor.get_data(audio_processor.Modes.TESTING).batch(FLAGS.batch_size) + expected_indices = np.concatenate([y for x, y in test_data]) + + predictions = model.predict(test_data) + predicted_indices = tf.argmax(predictions, axis=1) + + test_accuracy = calculate_accuracy(predicted_indices, expected_indices) + confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices, + num_classes=model_settings['label_count']) + print(confusion_matrix.numpy()) + print(f'Test accuracy = {test_accuracy * 100:.2f}%' + f'(N={audio_processor.set_size(audio_processor.Modes.TESTING)})') + + +def calculate_accuracy(predicted_indices, expected_indices): + """Calculates and returns accuracy. + + Args: + predicted_indices: List of predicted integer indices. + expected_indices: List of expected integer indices. + + Returns: + Accuracy value between 0 and 1. + """ + correct_prediction = tf.equal(predicted_indices, expected_indices) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy + + +def evaluate(): + """Calculate accuracy and confusion matrices on validation and test sets. + + Model is created and weights loaded from supplied command line arguments. 
+ """ + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + if FLAGS.tflite_path: + tflite_test(model_settings, audio_processor, FLAGS.tflite_path) + + if FLAGS.checkpoint: + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, False) + model.load_weights(FLAGS.checkpoint).expect_partial() + keras_test(model_settings, audio_processor, model) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from') + parser.add_argument( + '--tflite_path', + type=str, + help='Path to TFLite file to use for evaluation') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + evaluate() diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb 
b/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb new file mode 100644 index 0000000..1391914 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/how_to_guidance.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved.\n", + "#\n", + "# SPDX-License-Identifier: Apache-2.0\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the License); you may\n", + "# not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an AS IS BASIS, WITHOUT\n", + "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DS_CNN_Small - Hero\n", + "\n", + "Here we reproduce the models with our established codebase and ModelPackage approach for your convenience.\n", + "\n", + "## Model-Package Overview:\n", + "\n", + "| Model \t| DS_CNN_Small \t|\n", + "|:---------------:\t|:---------------------------------------------------------------:\t|\n", + "| **Format**: \t| Keras, Saved Model, TensorFlow Lite int8, TensorFlow Lite fp32 |\n", + "| **Feature**: \t| Keyword spotting for Arm Cortex-M CPUs |\n", + "| **Architectural Delta w.r.t. Vanilla**: | None |\n", + "| **Domain**: \t| Keyword spotting |\n", + "| **Package Quality**: \t| Hero |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of contents \n", + "\n", + "This how-to guidance presents the key steps to reproduce everything in this package. The contents are organised as below. We provided the internal navigation links for users to easy-jump among different sections. 
\n", + "\n", + " \n", + "* [1.0 Model recreation](#model_recreation)\n", + "\n", + "* [2.0 Training](#training)\n", + "\n", + "* [3.0 Testing](#testing)\n", + "\n", + "* [4.0 Optimization](#optimization)\n", + "\n", + "* [5.0 Quantization and TFLite conversion](#tflite_conversion)\n", + "\n", + "* [6.0 Inference the TFLite model files](#tflite_inference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Model Recreation\n", + "\n", + "In order to recreate the model you will first need to be using ```Python3.7``` and install the requirements in ```requirements.txt```.\n", + "\n", + "Once you have these requirements satisfied you can execute the recreation script contained within this folder, just run:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:04:29.102214: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:05:19.918303: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:05:19.952173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:05:19.952211: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:05:19.971851: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:05:19.971921: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:05:19.974596: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:05:19.974884: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:05:19.975441: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:05:19.976147: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:05:19.976295: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:05:19.976755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:05:19.977035: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:05:19.977720: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + 
"2023-01-31 12:05:19.978052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:05:19.978106: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:05:20.390120: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:05:20.390158: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:05:20.390167: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:05:20.390683: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:05:22.730373: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:05:24.433377: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:05:24.433576: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:05:24.434021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:05:24.434280: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:05:24.434312: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:05:24.434324: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:05:24.434333: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:05:24.434616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:05:24.451559: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:05:24.458087: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.014ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.001ms.\n", + "\n", + "2023-01-31 12:05:24.730913: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:05:24.730951: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:05:24.736446: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:05:24.739564: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:05:24.739849: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:05:24.739885: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:05:24.739895: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:05:24.739902: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:05:24.740218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "Converted model saved to ds_cnn.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "2023-01-31 12:05:24.804992: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 311 1 9 2 9 8 4 11 3 4 9]\n", + " [ 0 5 387 1 0 0 3 0 0 0 0 1]\n", + " [ 0 11 5 372 1 6 5 0 0 0 0 6]\n", + " [ 0 4 0 0 327 0 2 0 1 10 6 0]\n", + " [ 0 2 2 6 0 360 0 1 1 0 1 4]\n", + " [ 0 1 7 0 3 1 333 5 0 0 0 2]\n", + " [ 0 5 0 1 0 0 5 350 1 0 0 1]\n", + " [ 1 5 0 1 4 1 0 1 343 7 0 0]\n", + " [ 0 1 1 1 16 0 2 1 5 343 1 2]\n", + " [ 1 2 0 0 9 1 0 0 0 3 334 0]\n", + " [ 0 15 0 14 1 6 0 0 0 2 3 331]]\n", + "Validation accuracy = 93.63%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 354 5 8 4 2 7 8 6 0 5 9]\n", + " [ 0 5 404 1 0 0 9 0 0 0 0 0]\n", + " [ 0 4 1 380 0 7 3 0 0 0 0 10]\n", + " [ 0 4 0 0 396 1 1 0 2 14 4 3]\n", + " [ 0 12 1 9 0 376 2 0 1 0 1 4]\n", + " [ 0 2 7 1 1 0 399 1 0 0 1 0]\n", + " [ 0 10 0 0 1 1 6 376 0 0 2 0]\n", + " [ 0 7 1 0 4 0 0 0 364 16 1 3]\n", + " [ 1 5 1 3 12 0 1 0 2 369 1 7]\n", + " [ 0 1 0 1 4 2 1 0 1 1 397 3]\n", + " [ 0 3 2 18 1 5 1 0 0 2 2 368]]\n", + "Test accuracy = 93.89%(N=4890)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-31 12:05:46.655980: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "Untarring speech_commands_v0.02.tar.gz...\n", + "2023-01-31 12:06:37.310206: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1\n", + "2023-01-31 12:06:37.346033: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB 
deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:06:37.346068: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:06:37.365782: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11\n", + "2023-01-31 12:06:37.365855: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11\n", + "2023-01-31 12:06:37.368622: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10\n", + "2023-01-31 12:06:37.368939: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10\n", + "2023-01-31 12:06:37.369500: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11\n", + "2023-01-31 12:06:37.370276: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11\n", + "2023-01-31 12:06:37.370427: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8\n", + "2023-01-31 12:06:37.370808: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:06:37.371101: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-01-31 12:06:37.371913: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:06:37.372648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:06:37.372708: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", + "2023-01-31 12:06:37.810221: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:06:37.810261: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:06:37.810269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:06:37.810782: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n", + "2023-01-31 12:06:40.113450: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n", + "2023-01-31 12:06:41.895930: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1\n", + "2023-01-31 12:06:41.896029: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session\n", + "2023-01-31 12:06:41.896600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:06:41.896861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:06:41.896892: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:06:41.896901: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:06:41.896909: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:06:41.897198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:06:41.915523: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3492140000 Hz\n", + "2023-01-31 12:06:41.922229: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize\n", + " function_optimizer: function_optimizer did nothing. time = 0.019ms.\n", + " function_optimizer: function_optimizer did nothing. 
time = 0.003ms.\n", + "\n", + "2023-01-31 12:06:42.074632: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:345] Ignored output_format.\n", + "2023-01-31 12:06:42.074672: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:348] Ignored drop_control_dependency.\n", + "2023-01-31 12:06:42.079631: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "2023-01-31 12:06:42.082664: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: \n", + "pciBusID: 0000:03:00.0 name: NVIDIA TITAN Xp computeCapability: 6.1\n", + "coreClock: 1.582GHz coreCount: 30 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 510.07GiB/s\n", + "2023-01-31 12:06:42.082962: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0\n", + "2023-01-31 12:06:42.083001: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:\n", + "2023-01-31 12:06:42.083013: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0 \n", + "2023-01-31 12:06:42.083021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N \n", + "2023-01-31 12:06:42.083360: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11007 MB memory) -> physical GPU (device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:03:00.0, compute capability: 6.1)\n", + "2023-01-31 12:06:42.114217: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)\n", + "fully_quantize: 0, inference_type: 6, input_inference_type: 9, output_inference_type: 9\n", + "Quantized model saved to ds_cnn_quantized.tflite.\n", + "Running TFLite evaluation on validation set...\n", + "[[371 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 308 4 6 1 11 5 6 11 4 5 10]\n", + " [ 0 5 379 0 0 1 9 0 0 1 0 2]\n", + " [ 0 10 3 369 3 6 6 0 1 2 0 6]\n", + " [ 0 5 0 1 315 0 3 0 2 13 10 1]\n", + " [ 0 5 2 9 0 347 0 1 3 1 5 4]\n", + " [ 0 2 5 1 2 1 335 4 0 1 1 0]\n", + " [ 0 7 0 1 2 0 7 342 1 1 1 1]\n", + " [ 1 4 0 1 6 1 0 0 343 6 1 0]\n", + " [ 0 2 0 1 22 0 1 0 6 336 2 3]\n", + " [ 1 4 0 0 14 0 0 0 0 1 328 2]\n", + " [ 0 12 0 16 2 9 0 0 1 2 4 326]]\n", + "Validation accuracy = 92.22%(N=4445)\n", + "Running TFLite evaluation on test set...\n", + "[[408 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 347 3 8 5 5 8 8 9 1 6 8]\n", + " [ 0 7 399 2 0 1 8 0 0 1 1 0]\n", + " [ 0 4 1 377 4 7 2 0 0 0 1 9]\n", + " [ 0 5 1 0 390 1 1 1 2 14 6 4]\n", + " [ 0 15 0 12 2 361 4 0 2 1 1 8]\n", + " [ 0 6 5 2 4 0 393 2 0 0 0 0]\n", + " [ 0 9 0 0 5 0 10 365 1 1 2 3]\n", + " [ 0 9 0 1 6 1 3 1 357 15 1 2]\n", + " [ 0 4 1 2 15 0 1 0 2 369 1 7]\n", + " [ 0 1 0 2 4 3 2 0 1 3 393 2]\n", + " [ 0 5 2 21 3 7 2 0 0 3 1 358]]\n", + "Test accuracy = 92.37%(N=4890)\n" + ] + } + ], + "source": [ + "!bash ./recreate_model.sh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this script will use the pre-trained checkpoint files supplied in the ```./model_archive/model_source/weights``` folder to generate the TFLite files and perform evaluation on the test set. Both an fp32 version and a quantized version will be produced. The quantized version will use post-training quantization to fully quantize it.\n", + "\n", + "If you want to run training from scratch you can do this by supplying ```--train``` when running the script. 
For example:\n", +    "\n", +    "```bash\n", +    "bash ./recreate_model.sh --train\n", +    "```\n", +    "\n", +    "Training is then performed and should produce a model that reaches the accuracy stated in this repository. Note that exporting to TFLite will still happen with the baseline pre-trained checkpoint files, so you will need to re-run the script, this time supplying the path to the new checkpoint files you want to use, for example:\n", +    "\n", +    "```bash\n", +    "bash ./recreate_model.sh --ckpt \n", +    "```" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "## 2.0 Training\n", +    "\n", +    "The training scripts can be used to recreate any of the models from the [Hello Edge paper](https://arxiv.org/pdf/1711.07128.pdf) provided the right hyperparameters are used. The training commands with all the hyperparameters to reproduce the model in this repository are given [here](recreate_model.sh). The model in this part of the repository represents just one variation of the models from the paper; other variants are covered in other parts of the repository.\n", +    "\n", +    "\n", +    "As a general example of how to train a DNN with 3 fully-connected layers with 128 neurons in each layer, run:\n", +    "```\n", +    "python train.py --model_architecture dnn --model_size_info 128 128 128\n", +    "```\n", +    "\n", +    "The command line argument *--model_size_info* is used to pass the neural network layer\n", +    "dimensions, such as the number of layers and convolution filter size/stride, as a list to models.py,\n", +    "which builds the TensorFlow graph based on the provided model architecture\n", +    "and layer dimensions. For more info on *model_size_info* for each network architecture see\n", +    "[models.py](model_core_utils/models.py).\n" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "## 3.0 Testing\n", +    "To run inference on the trained model from a checkpoint and get accuracy on the validation and test sets, run:\n", +    "```\n", +    "python evaluation.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", +    "```\n", +    "**The model and feature extraction parameters passed to this script should match those used in the Training step.**" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "## 4.0 Optimization\n", +    "\n", +    "We introduce an *optional* step to optimize the trained keyword spotting model for deployment.\n", +    "\n", +    "Here we use TensorFlow's [weight clustering API](https://www.tensorflow.org/model_optimization/guide/clustering) to reduce the compressed model size and optimize inference on supported hardware. 32 weight clusters and the k-means++ cluster initialization method are used as the clustering hyperparameters.\n", +    "\n", +    "To optimize your trained model (e.g.
a DNN), a trained model checkpoint is needed to run clustering and fine-tuning on.\n", + "You can use the pre-trained checkpoints provided, or train your own model and use the resulting checkpoint.\n", + "\n", + "To apply the optimization and fine-tuning, run the following command:\n", + "```\n", + "python optimisations.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint \n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step, except for the number of training steps.\n", + "The number of training steps is reduced since the optimization step only requires fine-tuning.**\n", + "\n", + "This will generate a clustered model checkpoint that can be used in the quantization step to generate a quantized and clustered TFLite model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.0 Quantization and TFLite Conversion\n", + "\n", + "You can now use TensorFlow's\n", + "[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) to\n", + "make quantization of the trained models super simple.\n", + "\n", + "To quantize your trained model (e.g. a DNN) run:\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint [--inference_type int8|int16]\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "The ```inference_type``` parameter is *optional* and to be used if a fully quantized model with inputs and outputs of type int8 or int16 is needed. It defaults to fp32.\n", + "\n", + "In this example, this step will produce a quantized TFLite file *dnn_quantized.tflite*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can test the accuracy of this quantized model on the test set by running:\n", + "```\n", + "python evaluation.py --tflite_path dnn_quantized.tflite\n", + "```\n", + "**The model and feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "`convert_to_tflite.py` uses post-training quantization to generate a quantized model by default. If you wish to convert to a floating point TFLite model, use the command below:\n", + "\n", + "```\n", + "python convert_to_tflite.py --model_architecture dnn --model_size_info 128 128 128 --checkpoint --no-quantize\n", + "```\n", + "\n", + "This will produce a floating point TFLite file *dnn.tflite*. 
You can test the accuracy of this floating point model using `evaluation.py` as above.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.0 Single inference of the TFLite model files \n", + "\n", + "You can conduct TFLite inference for .fp32 and .int8 model files by using the following command: \n", + "\n", + "```python ds_cnn_s_inference_tflite.py --labels validation_utils/labels.txt --wav --tflite_path ```\n", + "\n", + "**The feature extraction parameters used here should match those used in the Training step.**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md new file mode 100644 index 0000000..b8fbdcb --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32 + +## Description +This is a floating point fp32 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper. + +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | fp32 | +| SHA-1 Hash | 8aadd5126bc0d3371c1b834d027c853e794423c1 | +| Size (Bytes) | 98756 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| accuracy | 93.89% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_multiplication_x: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. 
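+
+## Example Inference
+The snippet below is an illustrative sketch and is not part of the original package: it runs this fp32 TFLite file with the TensorFlow Lite Python interpreter on the example input shipped in this folder. The relative paths are assumptions; adjust them to wherever the files live on disk.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Load the fp32 DS-CNN Small TFLite file (path is an assumption).
+interpreter = tf.lite.Interpreter(model_path="ds_cnn_s.tflite")
+interpreter.allocate_tensors()
+
+input_details = interpreter.get_input_details()[0]
+output_details = interpreter.get_output_details()[0]
+
+# Example MFCC features from this folder: shape (1, 490), fp32.
+mfcc = np.load("testing_input/input/0.npy").astype(np.float32)
+
+interpreter.set_tensor(input_details["index"], mfcc)
+interpreter.invoke()
+
+probs = interpreter.get_tensor(output_details["index"])  # shape (1, 12)
+print("Predicted keyword index:", int(np.argmax(probs)))
+```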
+ +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_multiplication_x: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | fp32 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input | fp32 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | fp32 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity | fp32 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml new file mode 100644 index 0000000..71aa3f6 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + accuracy: 93.89% + benchmark_name: Google Speech Commands test set +description: This is a floating point fp32 version of the DS-CNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: fp32 + file_size_bytes: 98756 + filename: ds_cnn_s.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: 8aadd5126bc0d3371c1b834d027c853e794423c1 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input + shape: + - 1 + - 490 + type: fp32 + use_case: Random input for model regression. + input_datatype: fp32 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity + shape: + - 1 + - 12 + type: fp32 + use_case: output for model regression. 
+ name: Identity + output_datatype: fp32 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Hero + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: false + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite new file mode 100644 index 0000000..3fb7602 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/ds_cnn_s.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d302f1f2c53c1344edcde850e28130c0877b60e1567db977292239a9391f59b +size 98756 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy new file mode 100644 index 0000000..27d44a7 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_input/input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee7676110faaf59275371c1d6b27097d657f049967840cbd214d62a272fa543 +size 2088 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy new file mode 100644 index 0000000..38660ee --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_fp32/testing_output/Identity/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fad0cf24907c9eeb36f99fb498f09667e129f1cdbcca9b50cd826e9322b145d1 +size 176 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md new file mode 100644 index 0000000..b025116 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16 + +## Description +This is a fully quantized int16 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper. 
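+
+For illustration only (the package's own `convert_to_tflite.py --inference_type int16` is the reference flow), a model like this is typically produced with TFLite's "16x8" post-training quantization mode: 8-bit weights with 16-bit activations. The sketch below uses a placeholder Keras model and random calibration data purely to show the converter settings; whether the input/output tensors end up as int16 additionally depends on the converter version and options.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Placeholder model standing in for the trained DS-CNN Small Keras model.
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(12, activation="softmax", input_shape=(490,)),
+])
+
+def representative_dataset():
+    # Random MFCC-shaped samples; the real script calibrates on the speech dataset.
+    for _ in range(10):
+        yield [np.random.rand(1, 490).astype(np.float32)]
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = representative_dataset
+# "16x8" mode: 16-bit activations with 8-bit weights.
+converter.target_spec.supported_ops = [
+    tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+]
+with open("ds_cnn_s_quantized_int16.tflite", "wb") as f:
+    f.write(converter.convert())
+```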
+ +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int16 | +| SHA-1 Hash | e82c7d645bec3dec580a096de0a297c6dd9a6463 | +| Size (Bytes) | 55392 | +| Provenance | https://github.com/ARM-software/ML-examples/tree/main/tflu-kws-cortex-m | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 93.39% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| serving_default_input:0 | (1, 490) | int16 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input | int16 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| StatefulPartitionedCall:0 | (1, 12) | int16 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output | int16 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml new file mode 100644 index 0000000..730a6cc --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + Accuracy: 93.39% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int16 version of the DS-CNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int16 + file_size_bytes: 55392 + filename: ds_cnn_s_quantized_int16.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: e82c7d645bec3dec580a096de0a297c6dd9a6463 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input + shape: + - 1 + - 490 + type: int16 + use_case: Random input for model regression. 
+ input_datatype: int16 + name: serving_default_input:0 + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. + example_output: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output + shape: + - 1 + - 12 + type: int16 + use_case: output for model regression. + name: StatefulPartitionedCall:0 + output_datatype: int16 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Hero + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite new file mode 100644 index 0000000..d3d56fe --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/ds_cnn_s_quantized_int16.tflite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80b231d6848e6de69d70d36a17f9bb64022ae46d9957b1f6972b6527f943186 +size 55392 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy new file mode 100644 index 0000000..797c2b0 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_input/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e24c5c602a9c74776927198465769dc6e80645663bf7604ae45aed0586a066a +size 1108 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy new file mode 100644 index 0000000..4e37127 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int16/testing_output/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397aff56a28c4e81818c117ae49b216ad8ae501c3612b7abac2cdf9f45ccbf44 +size 152 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md new file mode 100644 index 0000000..3e9a6cc --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/README.md @@ -0,0 +1,62 @@ +# keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8 + +## Description +This is a fully quantized int8 version of the DS-CNN Small model developed by Arm, from the Hello Edge paper. 
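+
+For illustration only (not part of the original archive): because both the input and output tensors of this file are int8, fp32 MFCC features have to be quantized with the input tensor's scale and zero point before inference, and the output dequantized afterwards. The example `.npy` under `testing_input` is already int8 and can be fed directly; the sketch below assumes you start from fp32 features and uses random data as a stand-in.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_path="ds_cnn_s_quantized.tflite")
+interpreter.allocate_tensors()
+inp = interpreter.get_input_details()[0]
+out = interpreter.get_output_details()[0]
+
+# Stand-in for fp32 MFCC features of shape (1, 490) from the package's feature extraction.
+mfcc = np.random.rand(1, 490).astype(np.float32)
+
+# Quantize to int8 using the input tensor's scale and zero point.
+scale, zero_point = inp["quantization"]
+mfcc_q = np.clip(np.round(mfcc / scale + zero_point), -128, 127).astype(np.int8)
+
+interpreter.set_tensor(inp["index"], mfcc_q)
+interpreter.invoke()
+
+# Dequantize the int8 output back to keyword probabilities.
+out_scale, out_zero_point = out["quantization"]
+probs = (interpreter.get_tensor(out["index"]).astype(np.float32) - out_zero_point) * out_scale
+print("Predicted keyword index:", int(np.argmax(probs)))
+```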
+ +## License +[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) + +## Network Information +| Network Information | Value | +|---------------------|-------| +| Framework | TensorFlow Lite | +| Datatype | int8 | +| SHA-1 Hash | cf24429e86a9647b1632c382894bc68d26d34039 | +| Size (Bytes) | 47616 | +| Provenance | https://arxiv.org/abs/1711.07128 | +| Training | Trained by Arm | +| Paper | https://arxiv.org/abs/1711.07128 | + +## DataSet +| Dataset Information | Value | +|--------|-------| +| Name | Google Speech Commands test set | + +## Accuracy + +| Metric | Value | +|--------|-------| +| Accuracy | 93.11% | + +## HW Support +| HW Support | Value | +|--------------|-------| +| Cortex-A |:heavy_check_mark: | +| Cortex-M |:heavy_check_mark: HERO | +| Mali GPU |:heavy_check_mark: | +| Ethos U |:heavy_check_mark: | + +### Key +* :heavy_check_mark: - Will run on this platform. +* :heavy_multiplication_x: - Will not run on this platform. + +## Network Quality +| Network Quality | Value | +|-------------------------|-------| +| Recreate | :heavy_check_mark: | +| Quality level | Hero | +| Vanilla | :heavy_check_mark: | +| Clustered | :heavy_multiplication_x: | +| Pruned | :heavy_multiplication_x: | +| Quantization - default | :heavy_multiplication_x: | +| Quantization - full | :heavy_check_mark: | + +## Network Inputs +| Input Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| input | (1, 490) | int8 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input | int8 | [1, 490] | The input is a processed MFCCs | + +## Network Outputs +| Output Node Name | Shape | Type | Example Path | Example Type | Example Shape | Example Use Case | +|-----------------|-------|-------|--------------|-------|-------|-----------------| +| Identity | (1, 12) | int8 | models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity | int8 | [1, 12] | The probability on 12 keywords | \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml new file mode 100644 index 0000000..6d2f978 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/definition.yaml @@ -0,0 +1,66 @@ +benchmark: + benchmark_metrics: + Accuracy: 93.11% + benchmark_name: Google Speech Commands test set +description: This is a fully quantized int8 version of the DS-CNN Small model developed + by Arm, from the Hello Edge paper. +license: +- Apache-2.0 +network: + datatype: int8 + file_size_bytes: 47616 + filename: ds_cnn_s_quantized.tflite + framework: TensorFlow Lite + hash: + algorithm: sha1 + value: cf24429e86a9647b1632c382894bc68d26d34039 + provenance: https://arxiv.org/abs/1711.07128 + training: Trained by Arm +network_parameters: + input_nodes: + - description: The input is a processed MFCCs of shape (1, 490) + example_input: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input + shape: + - 1 + - 490 + type: int8 + use_case: Random input for model regression. + input_datatype: int8 + name: input + shape: + - 1 + - 490 + output_nodes: + - description: The probability on 12 keywords. 
+ example_output: + path: models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity + shape: + - 1 + - 12 + type: int8 + use_case: output for model regression. + name: Identity + output_datatype: int8 + shape: + - 1 + - 12 +network_quality: + clustered: false + is_vanilla: true + pruned: false + quality_level: Hero + quality_level_hero_hw: cortex_m + quantization_default: false + quantization_full: true + recreate: true +operators: + TensorFlow Lite: + - AVERAGE_POOL_2D + - CONV_2D + - DEPTHWISE_CONV_2D + - FULLY_CONNECTED + - RELU + - RESHAPE + - SOFTMAX +paper: https://arxiv.org/abs/1711.07128 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ds_cnn_s_quantized.tflite b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_s_quantized.tflite similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ds_cnn_s_quantized.tflite rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/ds_cnn_s_quantized.tflite diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input/0.npy rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_input/input/0.npy diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity/0.npy b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity/0.npy rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8/testing_output/Identity/0.npy diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb new file mode 100644 index 0000000..a265c82 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/keras_metadata.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edda8ec1a48de025c96dfcef1163b343f69616f516a6fec12279e71c5a58b4d2 +size 65399 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb new file mode 100644 index 0000000..3fd736c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/saved_model.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef43701d6901c7fa2452cf5390d2198b7ba14a3e5f41d10385ec152f0631349 +size 708163 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..4217bf8 --- 
/dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.data-00000-of-00001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cee02f3a1e371e6de9e2192600842bd92be832739233b8bdeaf6f3f3b9f1e73 +size 118118 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index new file mode 100644 index 0000000..364f025 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/saved_model/ds_cnn_small/variables/variables.index @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e77718d9698810a79c1ce8989db07245c69ae8d0277c5337703e3f32c6a863f5 +size 3570 diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/checkpoint rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/checkpoint diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.data-00000-of-00001 similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.data-00000-of-00001 rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.data-00000-of-00001 diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.index b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.index similarity index 100% rename from models/keyword_spotting/ds_cnn_small/tflite_int8/ckpt/ds_cnn_0.94_ckpt.index rename to models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/model_source/weights/ds_cnn_0.94_ckpt.index diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/__init__.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py new file mode 100644 index 0000000..1978136 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/model_core_utils/models.py @@ -0,0 +1,327 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Model definitions for simple keyword spotting.""" + +import math + +import tensorflow as tf + + +def prepare_model_settings(label_count, sample_rate, clip_duration_ms, + window_size_ms, window_stride_ms, + dct_coefficient_count): + """Calculates common settings needed for all models. + + Args: + label_count: How many classes are to be recognized. + sample_rate: Number of audio samples per second. + clip_duration_ms: Length of each audio clip to be analyzed. + window_size_ms: Duration of frequency analysis window. + window_stride_ms: How far to move in time between frequency windows. + dct_coefficient_count: Number of frequency bins to use for analysis. + + Returns: + Dictionary containing common settings. + """ + desired_samples = int(sample_rate * clip_duration_ms / 1000) + window_size_samples = int(sample_rate * window_size_ms / 1000) + window_stride_samples = int(sample_rate * window_stride_ms / 1000) + length_minus_window = (desired_samples - window_size_samples) + if length_minus_window < 0: + spectrogram_length = 0 + else: + spectrogram_length = 1 + int(length_minus_window / window_stride_samples) + fingerprint_size = dct_coefficient_count * spectrogram_length + + return { + 'desired_samples': desired_samples, + 'window_size_samples': window_size_samples, + 'window_stride_samples': window_stride_samples, + 'spectrogram_length': spectrogram_length, + 'dct_coefficient_count': dct_coefficient_count, + 'fingerprint_size': fingerprint_size, + 'label_count': label_count, + 'sample_rate': sample_rate, + } + + +def create_model(model_settings, model_architecture, model_size_info, is_training): + """Builds a tf.keras model of the requested architecture compatible with the settings. + + Args: + model_settings: Dictionary of information about the model. + model_architecture: String specifying which kind of model to create. + model_size_info: Array with specific information for the chosen architecture + (e.g convolutional parameters, number of layers). + + Returns: + A tf.keras Model with the requested architecture. + + Raises: + Exception: If the architecture type isn't recognized. + """ + + if model_architecture == 'dnn': + return create_dnn_model(model_settings, model_size_info) + + elif model_architecture == 'cnn': + return create_cnn_model(model_settings, model_size_info) + + elif model_architecture == 'ds_cnn': + return create_ds_cnn_model(model_settings, model_size_info) + elif model_architecture == 'single_fc': + return create_single_fc_model(model_settings) + elif model_architecture == 'basic_lstm': + return create_basic_lstm_model(model_settings, model_size_info, is_training) + else: + raise Exception(f'model_architecture argument {model_architecture} not recognized' + f', should be one of, "dnn", "cnn", "ds_cnn" ') + + +def create_single_fc_model(model_settings): + """Builds a model with a single fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + + Returns: + tf.keras Model of the 'SINGLE_FC' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'],), name='input') + # Fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(inputs) + + return tf.keras.Model(inputs, output) + + +def create_basic_lstm_model(model_settings, model_size_info, is_training): + """Builds a model with a basic lstm layer. + + For details see https://arxiv.org/abs/1711.07128. 
+ + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + is_training: Determining whether the use of the model is for training or for something else. + + Returns: + tf.keras Model of the 'Basic_LSTM' architecture. + """ + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size)) + + # LSTM layer, and unrolling depending on whether you are training or not + if is_training: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=False)(x) + else: + x = tf.keras.layers.LSTM(units=model_size_info[0], time_major=False, unroll=True)(x) + + # Outputs a fully connected layer + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_dnn_model(model_settings, model_size_info): + """Builds a model with multiple hidden fully-connected layers. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Length of the array defines the number of hidden-layers and + each element in the array represent the number of neurons in that layer. + + Returns: + tf.keras Model of the 'DNN' architecture. + """ + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size'], ), name='input') + + # First fully connected layer. + x = tf.keras.layers.Dense(units=model_size_info[0], activation='relu')(inputs) + + # Hidden layers with ReLU activations. + for i in range(1, len(model_size_info)): + x = tf.keras.layers.Dense(units=model_size_info[i], activation='relu')(x) + + # Output fully connected layer. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_cnn_model(model_settings, model_size_info): + """Builds a model with 2 convolution layers followed by a linear layer and a hidden fully-connected layer. + + For details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines the first and second convolution parameters in + {number of conv features, conv filter height, width, stride in y,x dir.}, + followed by linear layer size and fully-connected layer size. + + Returns: + tf.keras Model of the 'CNN' architecture. + """ + + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + first_filter_count = model_size_info[0] + first_filter_height = model_size_info[1] # Time axis. + first_filter_width = model_size_info[2] # Frequency axis. + first_filter_stride_y = model_size_info[3] # Time axis. + first_filter_stride_x = model_size_info[4] # Frequency_axis. + + second_filter_count = model_size_info[5] + second_filter_height = model_size_info[6] # Time axis. + second_filter_width = model_size_info[7] # Frequency axis. + second_filter_stride_y = model_size_info[8] # Time axis. + second_filter_stride_x = model_size_info[9] # Frequency axis. 
+ + linear_layer_size = model_size_info[10] + fc_size = model_size_info[11] + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # First convolution. + x = tf.keras.layers.Conv2D(filters=first_filter_count, + kernel_size=(first_filter_height, first_filter_width), + strides=(first_filter_stride_y, first_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Second convolution. + x = tf.keras.layers.Conv2D(filters=second_filter_count, + kernel_size=(second_filter_height, second_filter_width), + strides=(second_filter_stride_y, second_filter_stride_x), + padding='VALID')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Flatten for fully connected layers. + x = tf.keras.layers.Flatten()(x) + + # Fully connected layer with no activation. + x = tf.keras.layers.Dense(units=linear_layer_size)(x) + + # Fully connected layer with ReLU activation. + x = tf.keras.layers.Dense(units=fc_size)(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + x = tf.keras.layers.Dropout(rate=0)(x) + + # Output fully connected. + output = tf.keras.layers.Dense(units=model_settings['label_count'], activation='softmax')(x) + + return tf.keras.Model(inputs, output) + + +def create_ds_cnn_model(model_settings, model_size_info): + """Builds a model with convolutional & depthwise separable convolutional layers. + + For more details see https://arxiv.org/abs/1711.07128. + + Args: + model_settings: Dict of different settings for model training. + model_size_info: Defines number of layers, followed by the DS-Conv layer + parameters in the order {number of conv features, conv filter height, + width and stride in y,x dir.} for each of the layers. + + Returns: + tf.keras Model of the 'DS-CNN' architecture. + """ + + label_count = model_settings['label_count'] + input_frequency_size = model_settings['dct_coefficient_count'] + input_time_size = model_settings['spectrogram_length'] + + t_dim = input_time_size + f_dim = input_frequency_size + + # Extract model dimensions from model_size_info. + num_layers = model_size_info[0] + conv_feat = [None]*num_layers + conv_kt = [None]*num_layers + conv_kf = [None]*num_layers + conv_st = [None]*num_layers + conv_sf = [None]*num_layers + + i = 1 + for layer_no in range(0, num_layers): + conv_feat[layer_no] = model_size_info[i] + i += 1 + conv_kt[layer_no] = model_size_info[i] + i += 1 + conv_kf[layer_no] = model_size_info[i] + i += 1 + conv_st[layer_no] = model_size_info[i] + i += 1 + conv_sf[layer_no] = model_size_info[i] + i += 1 + + inputs = tf.keras.Input(shape=(model_settings['fingerprint_size']), name='input') + + # Reshape the flattened input. + x = tf.reshape(inputs, shape=(-1, input_time_size, input_frequency_size, 1)) + + # Depthwise separable convolutions. + for layer_no in range(0, num_layers): + if layer_no == 0: + # First convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[0], + kernel_size=(conv_kt[0], conv_kf[0]), + strides=(conv_st[0], conv_sf[0]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + else: + # Depthwise convolution. 
+ x = tf.keras.layers.DepthwiseConv2D(kernel_size=(conv_kt[layer_no], conv_kf[layer_no]), + strides=(conv_sf[layer_no], conv_st[layer_no]), + padding='SAME')(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + # Pointwise convolution. + x = tf.keras.layers.Conv2D(filters=conv_feat[layer_no], kernel_size=(1, 1))(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.ReLU()(x) + + t_dim = math.ceil(t_dim/float(conv_st[layer_no])) + f_dim = math.ceil(f_dim/float(conv_sf[layer_no])) + + # Global average pool. + x = tf.keras.layers.AveragePooling2D(pool_size=(t_dim, f_dim), strides=1)(x) + + # Squeeze before passing to output fully connected layer. + x = tf.reshape(x, shape=(-1, conv_feat[layer_no])) + + # Output connected layer. + output = tf.keras.layers.Dense(units=label_count, activation='softmax')(x) + + return tf.keras.Model(inputs, output) diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py new file mode 100644 index 0000000..16b6f4c --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/optimisations.py @@ -0,0 +1,259 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for optimizing simple keyword spotting models using clustering API.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np +import tensorflow_model_optimization as tfmot + +from data_processing import data_preprocessing +from model_core_utils import models + + +def print_model_weight_clusters(model): + + for layer in model.layers: + if isinstance(layer, tf.keras.layers.Wrapper): + weights = layer.trainable_weights + else: + weights = layer.weights + for weight in weights: + if "kernel" in weight.name: + unique_count = len(np.unique(weight)) + print( + f"{layer.name}/{weight.name}: {unique_count} clusters " + ) + + +def optimize(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model to optimize from checkpoint. + model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info) + model.load_weights(FLAGS.checkpoint).expect_partial() + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. 
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + cluster_weights = tfmot.clustering.keras.cluster_weights + CentroidInitialization = tfmot.clustering.keras.CentroidInitialization + + clustering_params = { + 'number_of_clusters': 32, + 'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS} + + clustered_model = cluster_weights(model, **clustering_params) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Train the model with clustering applied. + clustered_model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data) + + stripped_clustered_model = tfmot.clustering.keras.strip_clustering(clustered_model) + + print_model_weight_clusters(stripped_clustered_model) + + # Save the clustered model weights + train_dir = Path(FLAGS.train_dir) / "optimized" + train_dir.mkdir(parents=True, exist_ok=True) + + stripped_clustered_model.save_weights((train_dir / + (FLAGS.model_architecture + + "_clustered_ckpt"))) + + # Test the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + stripped_clustered_model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + test_loss, test_acc = stripped_clustered_model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. + """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. 
+ """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='3750,750', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--save_step_interval', + type=int, + default=100, + help='Save model checkpoint every save_steps.') + parser.add_argument( + '--checkpoint', + type=str, + help='Checkpoint to load the weights from before fine-tuning.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + optimize() diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh b/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh new file mode 100644 index 0000000..a081905 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/recreate_model.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (C) 2023 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +ckpt_path=model_archive/model_source/weights/ds_cnn_0.94_ckpt +train=false + +# Parse command line args +while (( $# >= 1 )); do + case $1 in + --ckpt) + if [ "$2" ]; then + ckpt_path=$2 + shift + else + printf 'ERROR: "--ckpt" requires a path to be supplied.\n' + exit 1 + fi + ;; + --train) + train=true + break;; + *) shift; + esac; +done + + +# DS-CNN Small training +if [ "$train" = true ] +then +python train.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --learning_rate 0.0005,0.0001,0.00002 --how_many_training_steps 10000,10000,10000 --summaries_dir work/DS_CNN/DS_CNN_S/retrain_logs --train_dir work/DS_CNN/DS_CNN_S/training +fi + +# Conversion to TFLite fp32 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --no-quantize + +# Conversion to TFLite int8 +python convert_to_tflite.py --model_architecture ds_cnn --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 --dct_coefficient_count 10 --window_size_ms 40 --window_stride_ms 20 --checkpoint $ckpt_path --inference_type int8 + diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt b/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt new file mode 100644 index 0000000..3448cff --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/requirements.txt @@ -0,0 +1,3 @@ +numpy == 1.19.5 +tensorflow == 2.5.0 +tensorflow-model-optimization == 0.6.0 \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py b/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py new file mode 100644 index 0000000..8c488b3 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/train.py @@ -0,0 +1,227 @@ +# Copyright © 2023 Arm Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for training simple keyword spotting models.""" + +import argparse +from pathlib import Path + +import tensorflow as tf +import numpy as np + +from data_processing import data_preprocessing +from model_core_utils import models + + +def train(): + model_settings = models.prepare_model_settings( + len(data_preprocessing.prepare_words_list(FLAGS.wanted_words.split(','))), + FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, + FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) + + # Create the model. 
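+    # --model_size_info supplies the per-layer dimensions for the chosen architecture
+    # (see recreate_model.sh, which passes the DS-CNN Small configuration
+    # '5 64 10 4 2 2 64 3 3 1 1 ...'); the trailing 'True' presumably selects
+    # training-time behaviour of the model.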
+ model = models.create_model(model_settings, FLAGS.model_architecture, FLAGS.model_size_info, True) + + audio_processor = data_preprocessing.AudioProcessor(data_url=FLAGS.data_url, + data_dir=FLAGS.data_dir, + silence_percentage=FLAGS.silence_percentage, + unknown_percentage=FLAGS.unknown_percentage, + wanted_words=FLAGS.wanted_words.split(','), + validation_percentage=FLAGS.validation_percentage, + testing_percentage=FLAGS.testing_percentage, + model_settings=model_settings) + + # We decay learning rate in a constant piecewise way to help learning. + training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) + learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) + lr_boundary_list = training_steps_list[:-1] # Only need the values at which to change lr. + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=lr_boundary_list, + values=learning_rates_list) + + # Specify the optimizer configurations. + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + model.compile(optimizer=optimizer, + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + metrics=['accuracy']) + + train_data = audio_processor.get_data(audio_processor.Modes.TRAINING, + FLAGS.background_frequency, FLAGS.background_volume, + int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)) + train_data = train_data.repeat().batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + val_data = audio_processor.get_data(audio_processor.Modes.VALIDATION) + val_data = val_data.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + + # We train for a max number of iterations so need to calculate how many 'epochs' this will be. + training_steps_max = np.sum(training_steps_list) + training_epoch_max = int(np.ceil(training_steps_max / FLAGS.eval_step_interval)) + + # Callbacks. + train_dir = Path(FLAGS.train_dir) / "best" + train_dir.mkdir(parents=True, exist_ok=True) + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=(train_dir / (FLAGS.model_architecture + "_{val_accuracy:.3f}_ckpt")), + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.summaries_dir) + + # Train the model. + model.fit(x=train_data, + steps_per_epoch=FLAGS.eval_step_interval, + epochs=training_epoch_max, + validation_data=val_data, + callbacks=[model_checkpoint_callback, tensorboard_callback]) + + # Test and save the model. + test_data = audio_processor.get_data(audio_processor.Modes.TESTING) + test_data = test_data.batch(FLAGS.batch_size) + + test_loss, test_acc = model.evaluate(x=test_data) + print(f'Final test accuracy: {test_acc*100:.2f}%') + model.save(f'saved_model/{FLAGS.model_architecture}') + model.save(f'keras/{FLAGS.model_architecture}.h5') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_url', + type=str, + default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + help='Location of speech training data archive on the web.') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/speech_dataset/', + help="""\ + Where to download the speech training data to. + """) + parser.add_argument( + '--background_volume', + type=float, + default=0.1, + help="""\ + How loud the background noise should be, between 0 and 1. + """) + parser.add_argument( + '--background_frequency', + type=float, + default=0.8, + help="""\ + How many of the training samples have background noise mixed in. 
+ """) + parser.add_argument( + '--silence_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be silence. + """) + parser.add_argument( + '--unknown_percentage', + type=float, + default=10.0, + help="""\ + How much of the training data should be unknown words. + """) + parser.add_argument( + '--time_shift_ms', + type=float, + default=100.0, + help="""\ + Range to randomly shift the training audio by in time. + """) + parser.add_argument( + '--testing_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a test set.') + parser.add_argument( + '--validation_percentage', + type=int, + default=10, + help='What percentage of wavs to use as a validation set.') + parser.add_argument( + '--sample_rate', + type=int, + default=16000, + help='Expected sample rate of the wavs',) + parser.add_argument( + '--clip_duration_ms', + type=int, + default=1000, + help='Expected duration in milliseconds of the wavs',) + parser.add_argument( + '--window_size_ms', + type=float, + default=30.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--window_stride_ms', + type=float, + default=10.0, + help='How long each spectrogram timeslice is',) + parser.add_argument( + '--dct_coefficient_count', + type=int, + default=40, + help='How many bins to use for the MFCC fingerprint',) + parser.add_argument( + '--how_many_training_steps', + type=str, + default='15000,3000', + help='How many training loops to run',) + parser.add_argument( + '--eval_step_interval', + type=int, + default=400, + help='How often to evaluate the training results.') + parser.add_argument( + '--learning_rate', + type=str, + default='0.001,0.0001', + help='How large a learning rate to use when training.') + parser.add_argument( + '--batch_size', + type=int, + default=100, + help='How many items to train with at once',) + parser.add_argument( + '--summaries_dir', + type=str, + default='/tmp/retrain_logs', + help='Where to save summary logs for TensorBoard.') + parser.add_argument( + '--wanted_words', + type=str, + default='yes,no,up,down,left,right,on,off,stop,go', + help='Words to use (others will be added to an unknown label)',) + parser.add_argument( + '--train_dir', + type=str, + default='/tmp/speech_commands_train', + help='Directory to write event logs and checkpoint.') + parser.add_argument( + '--model_architecture', + type=str, + default='dnn', + help='What model architecture to use') + parser.add_argument( + '--model_size_info', + type=int, + nargs="+", + default=[128, 128, 128], + help='Model dimensions - different for various models') + + FLAGS, _ = parser.parse_known_args() + train() diff --git a/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt b/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt new file mode 100644 index 0000000..ba41645 --- /dev/null +++ b/models/keyword_spotting/ds_cnn_small/model_package_tf/validation_utils/labels.txt @@ -0,0 +1,12 @@ +_silence_ +_unknown_ +yes +no +up +down +left +right +on +off +stop +go \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md b/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md deleted file mode 100644 index 26be0bf..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int16/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DS-CNN Small INT16 - -## Description -This is a fully quantized version (asymmetrical int16) of the DS-CNN Small model developed by Arm, with training 
checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | e82c7d645bec3dec580a096de0a297c6dd9a6463 | -| Size (Bytes) | 55392 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.933 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: HERO | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. | diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint deleted file mode 100644 index 7415b78..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/checkpoint +++ /dev/null @@ -1,2 +0,0 @@ -model_checkpoint_path: "ds_cnn_0.939_ckpt" -all_model_checkpoint_paths: "ds_cnn_0.939_ckpt" diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001 b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001 deleted file mode 100644 index d850952..0000000 Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.data-00000-of-00001 and /dev/null differ diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index b/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index deleted file mode 100644 index 75f70e3..0000000 Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ckpt/ds_cnn_0.939_ckpt.index and /dev/null differ diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml b/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml deleted file mode 100644 index 59c1dc7..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int16/definition.yaml +++ /dev/null @@ -1,48 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 93.39% -description: 'This is a fully quantized version (asymmetrical int16) of the DS-CNN - Small model developed by Arm, with training checkpoints, from the Hello Edge paper. 
- Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 55392 - filename: ds_cnn_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: e82c7d645bec3dec580a096de0a297c6dd9a6463 - provenance: https://github.com/ARM-software/ML-examples/tree/main/tflu-kws-cortex-m -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/ds_cnn/tflite_int16/testing_input/serving_default_input:0 - shape: - - 1 - - 490 - type: int16 - use_case: Random input for model regression. - input_datatype: int16 - name: serving_default_input:0 - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. - name: StatefulPartitionedCall:0 - output_datatype: int16 - shape: - - 1 - - 12 -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - FULLY_CONNECTED - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite b/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite deleted file mode 100644 index b19b478..0000000 Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/ds_cnn_quantized.tflite and /dev/null differ diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh b/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int16/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy b/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy deleted file mode 100644 index 75a2851..0000000 Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_input/serving_default_input:0/0.npy and /dev/null differ diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy b/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy deleted file mode 100644 index b4c71a3..0000000 Binary files a/models/keyword_spotting/ds_cnn_small/tflite_int16/testing_output/StatefulPartitionedCall:0/0.npy and /dev/null differ diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md b/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md deleted file mode 100644 index 230a02f..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int8/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# DS-CNN Small INT8 - -## Description -This is a fully quantized version (asymmetrical int8) of the DS-CNN Small model developed by Arm, with training checkpoints, from the Hello Edge paper. Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - -## License -[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) - -## Related Materials -### Class Labels -The class labels associated with this model can be downloaded by running the script `get_class_labels.sh`. - -### Model Recreation Code -Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m. - -## Network Information -| Network Information | Value | -|---------------------|------------------| -| Framework | TensorFlow Lite | -| SHA-1 Hash | cf24429e86a9647b1632c382894bc68d26d34039 | -| Size (Bytes) | 47616 | -| Provenance | https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m | -| Paper | https://arxiv.org/abs/1711.07128 | - -## Accuracy -Dataset: Google Speech Commands Test Set - -| Metric | Value | -|--------|-------| -| Accuracy | 0.935 | - -## Performance -| Platform | Optimized | -|----------|:---------:| -| Cortex-A |:heavy_check_mark: | -| Cortex-M |:heavy_check_mark: HERO | -| Mali GPU |:heavy_check_mark: | -| Ethos U |:heavy_check_mark: | - -### Key -* :heavy_check_mark: - Will run on this platform. -* :heavy_multiplication_x: - Will not run on this platform. - - - -## Optimizations -| Optimization | Value | -|-----------------|---------| -| Quantization | INT8 | - -## Network Inputs -| Input Node Name | Shape | Description | -|-----------------|---------|-------------| -| input | (1, 490) | The input is a processed MFCCs of shape (1, 490) | - -## Network Outputs -| Output Node Name | Shape | Description | -|------------------|---------|-------------| -| Identity | (1, 12) | The probability on 12 keywords. 
| diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml b/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml deleted file mode 100644 index 5e507b4..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int8/definition.yaml +++ /dev/null @@ -1,45 +0,0 @@ -benchmark: - Google Speech Commands test set: - Accuracy: 93.56% -description: 'This is a fully quantized version (asymmetrical int8) of the DS-CNN - Small model developed by Arm, with training checkpoints, from the Hello Edge paper. - Code to recreate this model can be found here: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m' -license: -- Apache-2.0 -network: - file_size_bytes: 47616 - filename: ds_cnn_s_quantized.tflite - framework: TensorFlow Lite - hash: - algorithm: sha1 - value: cf24429e86a9647b1632c382894bc68d26d34039 - provenance: https://github.com/ARM-software/ML-examples/tree/master/tflu-kws-cortex-m - quality_level: hero#CORTEX-M -network_parameters: - input_nodes: - - description: The input is a processed MFCCs of shape (1, 490) - example_input: - path: models/keyword_spotting/ds_cnn_small/tflite_int8/testing_input/input - name: input - shape: - - 1 - - 490 - output_nodes: - - description: The probability on 12 keywords. - name: Identity - shape: - - 1 - - 12 - test_output_path: models/keyword_spotting/ds_cnn_small/tflite_int8/testing_output/Identity -operators: - TensorFlow Lite: - - AVERAGE_POOL_2D - - CONV_2D - - DEPTHWISE_CONV_2D - - DEQUANTIZE - - FULLY_CONNECTED - - QUANTIZE - - RELU - - RESHAPE - - SOFTMAX -paper: https://arxiv.org/abs/1711.07128 diff --git a/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh b/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh deleted file mode 100755 index e59caf5..0000000 --- a/models/keyword_spotting/ds_cnn_small/tflite_int8/get_class_labels.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -wget https://raw.githubusercontent.com/ARM-software/ML-KWS-for-MCU/e9cf319e9aa2ff71d433e111477dd95329fb94cb/Pretrained_models/labels.txt -mv labels.txt labelmappings.txt \ No newline at end of file