Merge pull request #45 from ARM-software/tiny-wav2letter-new

tiny-wav2letter
ARM-software · Feb 23, 2022 · 8595dc6 · 8595dc6
2 parents b9e26e6 + 61e9a31
commit 8595dc6
Show file tree

Hide file tree

Showing 63 changed files with 4,724 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -361,9 +361,29 @@
         <td align="center">:heavy_check_mark: </td>
         <td align="center">0.0783</td>
     </tr>
+    <tr>
+        <td><a href="models/speech_recognition/tiny_wav2letter/tflite_int8">Tiny Wav2letter INT8 *</a></td>
+        <td align="center">INT8</td>
+        <td align="center">TensorFlow Lite</td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">:heavy_multiplication_x: </td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">0.0348</td>
+    </tr>
+    <tr>
+        <td><a href="models/speech_recognition/tiny_wav2letter/tflite_pruned_int8">Tiny Wav2letter Pruned INT8 *</a></td>
+        <td align="center">INT8</td>
+        <td align="center">TensorFlow Lite</td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">:heavy_multiplication_x: </td>
+        <td align="center">:heavy_check_mark: </td>
+        <td align="center">0.0283</td>
+    </tr>
 </table>
 
-**Dataset**: LibriSpeech
+**Dataset**: LibriSpeech, Fluent Speech
 
 ## Superresolution
 

diff --git a/models/speech_recognition/tiny_wav2letter/tflite_int8/README.md b/models/speech_recognition/tiny_wav2letter/tflite_int8/README.md
@@ -0,0 +1,74 @@
+# Tiny Wav2letter INT8
+
+## Description
+Tiny Wav2letter is a tiny version of the original Wav2Letter model. It is a convolutional speech recognition neural network. This implementation was created by Arm, pruned to 50% sparsity, fine-tuned and quantized using the TensorFlow Model Optimization Toolkit.
+
+
+
+## License
+[Apache-2.0](https://spdx.org/licenses/Apache-2.0.html)
+
+## Network Information
+| Network Information |  Value         |
+|---------------------|----------------|
+|  Framework          | TensorFlow Lite |
+|  SHA-1 Hash         | 13ca2294ba4bbb1f1c6c5e663cb532d58cd76a6b |
+|  Size (Bytes)       | 3997112 |
+|  Provenance         | https://github.com/ARM-software/ML-zoo/tree/master/models/speech_recognition/wav2letter |
+|  Paper              | https://arxiv.org/abs/1609.03193 |
+
+## Performance
+
+| Platform | Optimized |
+|----------|:---------:|
+| Cortex-A |:heavy_check_mark:          |
+| Cortex-M |:heavy_check_mark:          |
+| Mali GPU |:heavy_multiplication_x:          |
+| Ethos U  |:heavy_check_mark:          |
+
+### Key
+* :heavy_check_mark: - Will run on this platform.
+* :heavy_multiplication_x: - Will not run on this platform.
+
+## Accuracy
+Dataset: Fluent Speech (trained on LibriSpeech, Mini LibriSpeech, Fluent Speech)
+<br />
+Please note that the Fluent Speech dataset hosted on Kaggle is a licensed dataset.
+
+| Metric | Value |
+|--------|-------|
+| LER | 0.0348 |
+| WER | 0.1123 |
+
+## Optimizations
+| Optimization |  Value  |
+|--------------|---------|
+| Quantization | INT8 |
+
+## Network Inputs
+<table>
+    <tr>
+        <th width="200">Input Node Name</th>
+        <th width="100">Shape</th>
+        <th width="300">Description</th>
+    </tr>
+    <tr>
+        <td>input_1_int8</td>
+        <td>(1, 296, 39)</td>
+        <td>Speech converted to MFCCs and quantized to INT8</td> 
+    </tr>
+</table>
+
+## Network Outputs
+<table>
+    <tr>
+        <th width="200">Output Node Name</th>
+        <th width="100">Shape</th>
+        <th width="300">Description</th>
+    </tr>
+    <tr>
+        <td>Identity_int8</td>
+        <td>(1, 1, 148, 29)</td>
+        <td>A tensor of time and class probabilities, that represents the probability of each class at each timestep. Should be passed to a decoder. For example ctc_beam_search_decoder.</td> 
+    </tr>
+</table>
diff --git a/models/speech_recognition/tiny_wav2letter/tflite_int8/definition.yaml b/models/speech_recognition/tiny_wav2letter/tflite_int8/definition.yaml
@@ -0,0 +1,60 @@
+author_notes: null
+benchmark:
+  benchmark_description: please note that fluent-speech-corpus dataset hosted on Kaggle
+    is a licensed dataset.
+  benchmark_link: https://www.kaggle.com/tommyngx/fluent-speech-corpus
+  benchmark_metrics:
+    LER: '0.0348'
+    WER: '0.1123'
+  benchmark_name: Fluent speech
+description: "Tiny Wav2letter is a tiny version of the original Wav2Letter model.\
+  \ It is a convolutional speech recognition neural network. This implementation was\
+  \ created by Arm, pruned to 50% sparsity, fine-tuned and quantized using the TensorFlow\
+  \ Model Optimization Toolkit.\r\n\r\n"
+license:
+- Apache-2.0
+network:
+  datatype: int8
+  file_size_bytes: 3997112
+  filename: tiny_wav2letter_int8.tflite
+  framework: TensorFlow Lite
+  framework_version: 2.4.1
+  hash:
+    algorithm: sha1
+    value: 13ca2294ba4bbb1f1c6c5e663cb532d58cd76a6b
+  provenance: https://github.com/ARM-software/ML-zoo/tree/master/models/speech_recognition/wav2letter
+  training: LibriSpeech,Mini LibriSpeech,fluent speech
+network_parameters:
+  input_nodes:
+  - description: Speech converted to MFCCs and quantized to INT8
+    example_input:
+      path: models/speech_recognition/tiny_wav2letter/tflite_int8/testing_input/input_1_int8
+    input_datatype: int8
+    name: input_1_int8
+    shape:
+    - 1
+    - 296
+    - 39
+  output_nodes:
+  - description: A tensor of time and class probabilities, that represents the probability
+      of each class at each timestep. Should be passed to a decoder. For example ctc_beam_search_decoder.
+    example_output:
+      path: models/speech_recognition/tiny_wav2letter/tflite_int8/testing_output/Identity_int8
+    name: Identity_int8
+    output_datatype: int8
+    shape:
+    - 1
+    - 1
+    - 148
+    - 29
+network_quality:
+  quality_level: Deployable
+  quality_level_hero_hw: null
+operators:
+  TensorFlow Lite:
+  - CONV_2D
+  - DEQUANTIZE
+  - LEAKY_RELU
+  - QUANTIZE
+  - RESHAPE
+paper: https://arxiv.org/abs/1609.03193
diff --git a/models/speech_recognition/tiny_wav2letter/tflite_int8/demo_input/84-121550-0000.flac b/models/speech_recognition/tiny_wav2letter/tflite_int8/demo_input/84-121550-0000.flac