Timeout early without confident speech

Avoid cutting command off during thinking pause
2026-07-01 02:55:57 +02:00 · 2026-06-29 16:23:18 -05:00 · 2026-06-29 14:55:20 -05:00
4 changed files with 256 additions and 24 deletions
@@ -522,6 +522,9 @@ class AudioSettings:
    silence_seconds: float = 0.7
    """Seconds of silence after voice command has ended."""

+    in_command_silence_threshold: float = 0.2
+    """Probability below which in-command audio counts toward ending the command."""
+
    def __post_init__(self) -> None:
        """Verify settings post-initialization."""
        if (self.noise_suppression_level < 0) or (self.noise_suppression_level > 4):
@@ -953,7 +956,10 @@ class PipelineRun:
                and self.stt_provider.audio_processing.requires_external_vad
            ):
                stt_vad = VoiceCommandSegmenter(
-                    silence_seconds=self.audio_settings.silence_seconds
+                    silence_seconds=self.audio_settings.silence_seconds,
+                    in_command_silence_threshold=(
+                        self.audio_settings.in_command_silence_threshold
+                    ),
                )

            result = await self.stt_provider.async_process_audio_stream(
@@ -29,6 +29,23 @@ class VadSensitivity(StrEnum):

        return 0.7

+    @staticmethod
+    def to_in_command_silence_threshold(sensitivity: VadSensitivity | str) -> float:
+        """Return in-command silence probability threshold for sensitivity level.
+
+        Lower values tolerate longer thinking pauses but also let background
+        noise hold the command open longer; a value equal to the in-command
+        speech threshold leaves no dead band apart from that exact probability.
+        """
+        sensitivity = VadSensitivity(sensitivity)
+        if sensitivity == VadSensitivity.RELAXED:
+            return 0.1
+
+        if sensitivity == VadSensitivity.AGGRESSIVE:
+            return 0.5
+
+        return 0.2
+

 class AudioBuffer:
    """Fixed-sized audio buffer with variable internal length."""
@@ -100,9 +117,21 @@ class VoiceCommandSegmenter:
    in_command_speech_threshold: float = 0.5
    """Probability threshold for speech during voice command."""

+    in_command_silence_threshold: float = 0.2
+    """Probability below which an in-command chunk counts toward ending the command."""
+
+    min_command_speech_seconds: float = 0.05
+    """Seconds of confident speech required before a command may finish on silence."""
+
+    false_start_timeout_seconds: float = 5.0
+    """Seconds to wait for confident speech before giving up (a false activation)."""
+
    _speech_seconds_left: float = 0.0
    """Seconds left before considering voice command as started."""

+    _command_speech_seconds_left: float = 0.0
+    """Seconds of confident speech still required before the command may finish."""
+
    _command_seconds_left: float = 0.0
    """Seconds left before voice command could stop."""

@@ -123,6 +152,7 @@ class VoiceCommandSegmenter:
        """Reset all counters and state."""
        self._speech_seconds_left = self.speech_seconds
        self._command_seconds_left = self.command_seconds - self.speech_seconds
+        self._command_speech_seconds_left = self.min_command_speech_seconds
        self._silence_seconds_left = self.silence_seconds
        self._timeout_seconds_left = self.timeout_seconds
        self._reset_seconds_left = self.reset_seconds
@@ -137,10 +167,15 @@ class VoiceCommandSegmenter:
            self.timed_out = False

        self._timeout_seconds_left -= chunk_seconds
-        if self._timeout_seconds_left <= 0:
+        no_speech_yet = self._command_speech_seconds_left > 0
+        elapsed = self.timeout_seconds - self._timeout_seconds_left
+        if (self._timeout_seconds_left <= 0) or (
+            no_speech_yet and (elapsed >= self.false_start_timeout_seconds)
+        ):
            _LOGGER.debug(
-                "VAD end of speech detection timed out after %s seconds",
-                self.timeout_seconds,
+                "VAD timed out after %.2f seconds (no_speech_yet=%s)",
+                elapsed,
+                no_speech_yet,
            )
            self.reset()
            self.timed_out = True
@@ -148,6 +183,9 @@ class VoiceCommandSegmenter:

        if speech_probability is None:
            speech_probability = 0.0
+        else:
+            # MicroVad returns -1.0 as a "no result yet" sentinel; clamp to [0, 1]
+            speech_probability = min(1.0, max(0.0, speech_probability))

        if not self.in_command:
            # Before command
@@ -155,6 +193,9 @@ class VoiceCommandSegmenter:
            if is_speech:
                self._reset_seconds_left = self.reset_seconds
                self._speech_seconds_left -= chunk_seconds
+                if speech_probability > self.in_command_speech_threshold:
+                    # Confident speech during onset also counts toward confirmation
+                    self._command_speech_seconds_left -= chunk_seconds
                if self._speech_seconds_left <= 0:
                    # Inside voice command
                    self.in_command = True
@@ -168,30 +209,37 @@ class VoiceCommandSegmenter:
                self._reset_seconds_left -= chunk_seconds
                if self._reset_seconds_left <= 0:
                    self._speech_seconds_left = self.speech_seconds
+                    self._command_speech_seconds_left = self.min_command_speech_seconds
                    self._reset_seconds_left = self.reset_seconds
        else:
            # In command
-            is_speech = speech_probability > self.in_command_speech_threshold
-            if not is_speech:
-                # Silence in command
-                self._reset_seconds_left = self.reset_seconds
-                self._silence_seconds_left -= chunk_seconds
-                self._command_seconds_left -= chunk_seconds
-                if (self._silence_seconds_left <= 0) and (
-                    self._command_seconds_left <= 0
-                ):
-                    # Command finished successfully
-                    self.reset()
-                    _LOGGER.debug("Voice command finished")
-                    return False
-            else:
+            self._command_seconds_left -= chunk_seconds
+            if speech_probability > self.in_command_speech_threshold:
                # Speech in command.
+                self._command_speech_seconds_left -= chunk_seconds
                # Reset silence counter if enough speech.
                self._reset_seconds_left -= chunk_seconds
-                self._command_seconds_left -= chunk_seconds
                if self._reset_seconds_left <= 0:
                    self._silence_seconds_left = self.silence_seconds
                    self._reset_seconds_left = self.reset_seconds
+            elif speech_probability < self.in_command_silence_threshold:
+                # Silence in command
+                self._reset_seconds_left = self.reset_seconds
+                self._silence_seconds_left -= chunk_seconds
+                if (self._silence_seconds_left <= 0) and (
+                    self._command_seconds_left <= 0
+                ):
+                    if self._command_speech_seconds_left <= 0:
+                        # Command finished successfully
+                        self.reset()
+                        _LOGGER.debug("Voice command finished")
+                        return False
+                    # Triggered without real speech: false start, resume listening
+                    _LOGGER.debug("Discarding false start (no speech detected)")
+                    timeout_seconds_left = self._timeout_seconds_left
+                    self.reset()
+                    self._timeout_seconds_left = timeout_seconds_left
+            # else: dead band neither advances nor resets the silence countdown

        return True

@@ -506,6 +506,7 @@ class AssistSatelliteEntity(entity.Entity):
            # Store the conversation ID. If it is no longer valid,
            # get_chat_session will reset it
            self._conversation_id = session.conversation_id
+            vad_sensitivity = self._resolve_vad_sensitivity()
            self._pipeline_task = (
                self.platform.config_entry.async_create_background_task(
                    self.hass,
@@ -529,7 +530,14 @@ class AssistSatelliteEntity(entity.Entity):
                        tts_audio_output=self.tts_options,
                        wake_word_phrase=wake_word_phrase,
                        audio_settings=AudioSettings(
-                            silence_seconds=self._resolve_vad_sensitivity()
+                            silence_seconds=vad.VadSensitivity.to_seconds(
+                                vad_sensitivity
+                            ),
+                            in_command_silence_threshold=(
+                                vad.VadSensitivity.to_in_command_silence_threshold(
+                                    vad_sensitivity
+                                )
+                            ),
                        ),
                        start_stage=start_stage,
                        end_stage=end_stage,
@@ -627,7 +635,7 @@ class AssistSatelliteEntity(entity.Entity):
        return None

    @callback
-    def _resolve_vad_sensitivity(self) -> float:
+    def _resolve_vad_sensitivity(self) -> vad.VadSensitivity:
        """Resolve VAD sensitivity from select entity to enum."""
        vad_sensitivity = vad.VadSensitivity.DEFAULT

@@ -639,7 +647,7 @@ class AssistSatelliteEntity(entity.Entity):

            vad_sensitivity = vad.VadSensitivity(vad_sensitivity_state.state)

-        return vad.VadSensitivity.to_seconds(vad_sensitivity)
+        return vad_sensitivity

    async def _resolve_announcement_media_id(
        self,
@@ -2,8 +2,11 @@

 import itertools as it

+import pytest
+
 from homeassistant.components.assist_pipeline.vad import (
    AudioBuffer,
+    VadSensitivity,
    VoiceCommandSegmenter,
    chunk_samples,
 )
@@ -234,9 +237,11 @@ def test_speech_thresholds() -> None:
    segmenter = VoiceCommandSegmenter(
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
        command_seconds=2,
        speech_seconds=1,
        silence_seconds=1,
+        min_command_speech_seconds=0.0,
    )

    # Not high enough probability to trigger command
@@ -247,6 +252,171 @@ def test_speech_thresholds() -> None:
    assert segmenter.process(_ONE_SECOND, 0.3)
    assert segmenter.in_command

-    # Now that same probability is considered silence.
-    # Finishes command.
+    # Dead-band probability is neither speech nor silence: command stays open
+    assert segmenter.process(_ONE_SECOND, 0.3)
+    assert segmenter.in_command
+
+    # Clear silence finishes the command
+    assert not segmenter.process(_ONE_SECOND, 0.05)
+
+
+def test_dead_band_preserves_pause() -> None:
+    """Test dead-band activity during a pause does not end the command early."""
+
+    segmenter = VoiceCommandSegmenter(
+        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
+        speech_seconds=0.5,
+        command_seconds=1.0,
+        silence_seconds=1.0,
+        reset_seconds=0.5,
+    )
+
+    # Start the command
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
+    assert segmenter.in_command
+
+    # 1.5s of dead-band activity (> silence_seconds) keeps the command open.
+    # The original single-threshold logic would have ended it during this pause.
+    for _ in range(3):
+        assert segmenter.process(_ONE_SECOND * 0.5, 0.3)
+        assert segmenter.in_command
+
+    # Speech resumes, then genuine silence ends the command
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
+    assert not segmenter.in_command
+
+
+def test_dead_band_times_out() -> None:
+    """Test sustained dead-band audio ends via timeout, never a premature finish."""
+
+    segmenter = VoiceCommandSegmenter(
+        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
+        speech_seconds=0.5,
+        timeout_seconds=2.0,
+    )
+
+    # Start the command
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
+    assert segmenter.in_command
+
+    # Dead-band audio holds the command open (never finishes) until the timeout
+    assert segmenter.process(_ONE_SECOND, 0.3)
+    assert not segmenter.timed_out
+
    assert not segmenter.process(_ONE_SECOND, 0.3)
+    assert segmenter.timed_out
+
+
+def test_false_start_aborts_and_resumes() -> None:
+    """Test a trigger without confident speech aborts instead of finishing."""
+
+    segmenter = VoiceCommandSegmenter(
+        before_command_speech_threshold=0.2,
+        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
+        min_command_speech_seconds=0.1,
+        speech_seconds=0.3,
+        command_seconds=1.0,
+        silence_seconds=0.5,
+    )
+
+    # A sub-0.5 transient (above entry, below speech) triggers a command...
+    assert segmenter.process(_ONE_SECOND * 0.3, 0.35)
+    assert segmenter.in_command
+
+    # ...but with no confident speech, the silence end aborts back to listening
+    # instead of finishing into near-empty audio.
+    assert segmenter.process(_ONE_SECOND, 0.0)
+    assert not segmenter.in_command
+
+    # The real command then arrives and finishes normally
+    assert segmenter.process(_ONE_SECOND, 1.0)
+    assert segmenter.in_command
+    assert not segmenter.process(_ONE_SECOND, 0.0)
+    assert not segmenter.in_command
+
+
+def test_false_start_timeout() -> None:
+    """Test a false activation gives up at false_start_timeout, not timeout_seconds."""
+
+    segmenter = VoiceCommandSegmenter(
+        before_command_speech_threshold=0.2,
+        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
+        min_command_speech_seconds=0.05,
+        false_start_timeout_seconds=3.0,
+        timeout_seconds=15.0,
+    )
+
+    # A trigger with no confident speech, followed by silence: gives up at 3s,
+    # well before the 15s command timeout.
+    assert segmenter.process(_ONE_SECOND, 0.35)
+    assert segmenter.process(_ONE_SECOND, 0.0)
+    assert not segmenter.timed_out
+
+    # Crosses false_start_timeout (3s elapsed) without confident speech
+    assert not segmenter.process(_ONE_SECOND, 0.0)
+    assert segmenter.timed_out
+
+
+def test_false_start_timeout_not_applied_after_speech() -> None:
+    """Test confident speech lifts the false-start timeout to the full command timeout."""
+
+    segmenter = VoiceCommandSegmenter(
+        in_command_speech_threshold=0.5,
+        min_command_speech_seconds=0.05,
+        false_start_timeout_seconds=3.0,
+        timeout_seconds=15.0,
+        speech_seconds=0.3,
+    )
+
+    # Confident speech early confirms the command...
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
+    assert segmenter.in_command
+
+    # ...so a long pause past false_start_timeout does NOT give up early
+    assert segmenter.process(_ONE_SECOND * 4, 0.3)
+    assert not segmenter.timed_out
+    assert segmenter.in_command
+
+
+def test_min_command_speech_disabled() -> None:
+    """Test min_command_speech_seconds=0 keeps the original finish behavior."""
+
+    segmenter = VoiceCommandSegmenter(
+        in_command_speech_threshold=0.5,
+        in_command_silence_threshold=0.2,
+        min_command_speech_seconds=0.0,
+        speech_seconds=0.3,
+        command_seconds=1.0,
+        silence_seconds=0.5,
+    )
+
+    # Trigger on a sub-0.5 transient and finish on silence (no abort)
+    assert segmenter.process(_ONE_SECOND * 0.3, 0.35)
+    assert segmenter.in_command
+    assert not segmenter.process(_ONE_SECOND, 0.0)
+    assert not segmenter.in_command
+
+
+@pytest.mark.parametrize(
+    ("sensitivity", "expected_threshold"),
+    [
+        pytest.param(VadSensitivity.AGGRESSIVE, 0.5, id="aggressive"),
+        pytest.param(VadSensitivity.DEFAULT, 0.2, id="default"),
+        pytest.param(VadSensitivity.RELAXED, 0.1, id="relaxed"),
+    ],
+)
+def test_vad_sensitivity_in_command_silence_threshold(
+    sensitivity: VadSensitivity, expected_threshold: float
+) -> None:
+    """Test sensitivity maps to an in-command silence threshold."""
+
+    assert (
+        VadSensitivity.to_in_command_silence_threshold(sensitivity)
+        == expected_threshold
+    )
Author	SHA1	Message	Date
Michael Hansen	f37023a316	Timeout early without confident speech	2026-06-29 16:23:18 -05:00
Michael Hansen	66e8c50377	Avoid cutting command off during thinking pause	2026-06-29 14:55:20 -05:00