LASR-at-Home · m-barker · Jun 11, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -31,6 +31,7 @@ class speech_model_params:
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
         timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
         warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
+        energy_threshold (Optional[int]): Energy threshold for silence detection. Using this disables automatic adjustment. Defaults to None.
     """
 
     model_name: str = "medium.en"
@@ -41,6 +42,7 @@ class speech_model_params:
     mic_device: Optional[str] = None
     timer_duration: Optional[int] = 20
     warmup: bool = True
+    energy_threshold: Optional[int] = None
 
 
 class TranscribeSpeechAction(object):
@@ -71,7 +73,7 @@ class TranscribeSpeechAction(object):
             self._model_params.warmup,
         )
         # Configure the speech recogniser object and adjust for ambient noise
-        self.recogniser = self._configure_recogniser(ambient_adj=True)
+        self.recogniser = self._configure_recogniser()
         # Setup the action server and register execution callback
         self._action_server = actionlib.SimpleActionServer(
             self._action_name,
@@ -84,11 +86,6 @@ class TranscribeSpeechAction(object):
 
         self._action_server.start()
 
-    def _reset_timer(self) -> None:
-        """Resets the timer for adjusting the microphone for ambient noise."""
-        self._timer.shutdown()
-        self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb)
-
     def _configure_microphone(self) -> sr.Microphone:
         """Configures the microphone for listening to speech based on the
         microphone device index or name.
@@ -116,20 +113,22 @@ class TranscribeSpeechAction(object):
                 f"Could not find microphone with name: {self._model_params.mic_device}"
             )
 
-    def _configure_recogniser(self, ambient_adj: bool = True) -> sr.Recognizer:
+    def _configure_recogniser(self) -> sr.Recognizer:
         """Configures the speech recogniser object.
 
-        Args:
-            ambient_adj (bool, optional): Whether to adjust for ambient noise. Defaults to True.
-
         Returns:
             sr.Recognizer: speech recogniser object.
         """
         self._listening = True
         recogniser = sr.Recognizer()
-        if ambient_adj:
-            with self._configure_microphone() as source:
-                recogniser.adjust_for_ambient_noise(source)
+
+        if self._model_params.energy_threshold:
+            # Fixed threshold: skip calibration but fall through so _listening is reset below.
+            recogniser.dynamic_energy_threshold = False
+            recogniser.energy_threshold = self._model_params.energy_threshold
+        else:
+            with self._configure_microphone() as source:
+                recogniser.adjust_for_ambient_noise(source)
         self._listening = False
         return recogniser
 
@@ -257,7 +256,14 @@ def parse_args() -> dict:
         help="Disable warming up the model by running inference on a test file.",
     )
 
-    args,unknown = parser.parse_known_args()
+    parser.add_argument(
+        "--energy_threshold",
+        type=int,
+        default=None,
+        help="Energy threshold for silence detection. Using this disables automatic adjustment",
+    )
+
+    args, unknown = parser.parse_known_args()
     return vars(args)