Compare commits

...

2 Commits

Author SHA1 Message Date
Michael Hansen f37023a316 Timeout early without confident speech 2026-06-29 16:23:18 -05:00
Michael Hansen 66e8c50377 Avoid cutting command off during thinking pause 2026-06-29 14:55:20 -05:00
4 changed files with 256 additions and 24 deletions
@@ -522,6 +522,9 @@ class AudioSettings:
silence_seconds: float = 0.7
"""Seconds of silence after voice command has ended."""
in_command_silence_threshold: float = 0.2
"""Probability below which in-command audio counts toward ending the command."""
def __post_init__(self) -> None:
"""Verify settings post-initialization."""
if (self.noise_suppression_level < 0) or (self.noise_suppression_level > 4):
@@ -953,7 +956,10 @@ class PipelineRun:
and self.stt_provider.audio_processing.requires_external_vad
):
stt_vad = VoiceCommandSegmenter(
silence_seconds=self.audio_settings.silence_seconds
silence_seconds=self.audio_settings.silence_seconds,
in_command_silence_threshold=(
self.audio_settings.in_command_silence_threshold
),
)
result = await self.stt_provider.async_process_audio_stream(
+66 -18
View File
@@ -29,6 +29,23 @@ class VadSensitivity(StrEnum):
return 0.7
@staticmethod
def to_in_command_silence_threshold(sensitivity: VadSensitivity | str) -> float:
    """Map a sensitivity level to the in-command silence probability threshold.

    A smaller threshold lets fewer chunks count as silence, so the command
    survives thinking pauses more easily (at the cost of hanging on longer in
    noise). A threshold equal to the in-command speech threshold removes the
    dead band between "speech" and "silence" altogether.

    The argument may be a VadSensitivity member or its string value; it is
    normalized through the VadSensitivity constructor before the lookup.
    """
    level = VadSensitivity(sensitivity)
    thresholds = {
        VadSensitivity.RELAXED: 0.1,
        VadSensitivity.AGGRESSIVE: 0.5,
    }
    # Any level without an explicit entry uses the default threshold
    return thresholds.get(level, 0.2)
class AudioBuffer:
"""Fixed-sized audio buffer with variable internal length."""
@@ -100,9 +117,21 @@ class VoiceCommandSegmenter:
in_command_speech_threshold: float = 0.5
"""Probability threshold for speech during voice command."""
in_command_silence_threshold: float = 0.2
"""Probability below which an in-command chunk counts toward ending the command."""
min_command_speech_seconds: float = 0.05
"""Confident speech required before a command may finish on silence."""
false_start_timeout_seconds: float = 5.0
"""Seconds to wait for confident speech before giving up (a false activation)."""
_speech_seconds_left: float = 0.0
"""Seconds left before considering voice command as started."""
_command_speech_seconds_left: float = 0.0
"""Confident speech still required before the command may finish."""
_command_seconds_left: float = 0.0
"""Seconds left before voice command could stop."""
@@ -123,6 +152,7 @@ class VoiceCommandSegmenter:
"""Reset all counters and state."""
self._speech_seconds_left = self.speech_seconds
self._command_seconds_left = self.command_seconds - self.speech_seconds
self._command_speech_seconds_left = self.min_command_speech_seconds
self._silence_seconds_left = self.silence_seconds
self._timeout_seconds_left = self.timeout_seconds
self._reset_seconds_left = self.reset_seconds
@@ -137,10 +167,15 @@ class VoiceCommandSegmenter:
self.timed_out = False
self._timeout_seconds_left -= chunk_seconds
if self._timeout_seconds_left <= 0:
no_speech_yet = self._command_speech_seconds_left > 0
elapsed = self.timeout_seconds - self._timeout_seconds_left
if (self._timeout_seconds_left <= 0) or (
no_speech_yet and (elapsed >= self.false_start_timeout_seconds)
):
_LOGGER.debug(
"VAD end of speech detection timed out after %s seconds",
self.timeout_seconds,
"VAD timed out after %.2f seconds (no_speech_yet=%s)",
elapsed,
no_speech_yet,
)
self.reset()
self.timed_out = True
@@ -148,6 +183,9 @@ class VoiceCommandSegmenter:
if speech_probability is None:
speech_probability = 0.0
else:
# MicroVad returns -1.0 as a "no result yet" sentinel; clamp to [0, 1]
speech_probability = min(1.0, max(0.0, speech_probability))
if not self.in_command:
# Before command
@@ -155,6 +193,9 @@ class VoiceCommandSegmenter:
if is_speech:
self._reset_seconds_left = self.reset_seconds
self._speech_seconds_left -= chunk_seconds
if speech_probability > self.in_command_speech_threshold:
# Confident speech during onset also counts toward confirmation
self._command_speech_seconds_left -= chunk_seconds
if self._speech_seconds_left <= 0:
# Inside voice command
self.in_command = True
@@ -168,30 +209,37 @@ class VoiceCommandSegmenter:
self._reset_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._speech_seconds_left = self.speech_seconds
self._command_speech_seconds_left = self.min_command_speech_seconds
self._reset_seconds_left = self.reset_seconds
else:
# In command
is_speech = speech_probability > self.in_command_speech_threshold
if not is_speech:
# Silence in command
self._reset_seconds_left = self.reset_seconds
self._silence_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if (self._silence_seconds_left <= 0) and (
self._command_seconds_left <= 0
):
# Command finished successfully
self.reset()
_LOGGER.debug("Voice command finished")
return False
else:
self._command_seconds_left -= chunk_seconds
if speech_probability > self.in_command_speech_threshold:
# Speech in command.
self._command_speech_seconds_left -= chunk_seconds
# Reset silence counter if enough speech.
self._reset_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._silence_seconds_left = self.silence_seconds
self._reset_seconds_left = self.reset_seconds
elif speech_probability < self.in_command_silence_threshold:
# Silence in command
self._reset_seconds_left = self.reset_seconds
self._silence_seconds_left -= chunk_seconds
if (self._silence_seconds_left <= 0) and (
self._command_seconds_left <= 0
):
if self._command_speech_seconds_left <= 0:
# Command finished successfully
self.reset()
_LOGGER.debug("Voice command finished")
return False
# Triggered without real speech: false start, resume listening
_LOGGER.debug("Discarding false start (no speech detected)")
timeout_seconds_left = self._timeout_seconds_left
self.reset()
self._timeout_seconds_left = timeout_seconds_left
# else: dead-band activity holds the command open; the silence countdown is neither advanced nor refreshed
return True
@@ -506,6 +506,7 @@ class AssistSatelliteEntity(entity.Entity):
# Store the conversation ID. If it is no longer valid,
# get_chat_session will reset it
self._conversation_id = session.conversation_id
vad_sensitivity = self._resolve_vad_sensitivity()
self._pipeline_task = (
self.platform.config_entry.async_create_background_task(
self.hass,
@@ -529,7 +530,14 @@ class AssistSatelliteEntity(entity.Entity):
tts_audio_output=self.tts_options,
wake_word_phrase=wake_word_phrase,
audio_settings=AudioSettings(
silence_seconds=self._resolve_vad_sensitivity()
silence_seconds=vad.VadSensitivity.to_seconds(
vad_sensitivity
),
in_command_silence_threshold=(
vad.VadSensitivity.to_in_command_silence_threshold(
vad_sensitivity
)
),
),
start_stage=start_stage,
end_stage=end_stage,
@@ -627,7 +635,7 @@ class AssistSatelliteEntity(entity.Entity):
return None
@callback
def _resolve_vad_sensitivity(self) -> float:
def _resolve_vad_sensitivity(self) -> vad.VadSensitivity:
"""Resolve VAD sensitivity from select entity to enum."""
vad_sensitivity = vad.VadSensitivity.DEFAULT
@@ -639,7 +647,7 @@ class AssistSatelliteEntity(entity.Entity):
vad_sensitivity = vad.VadSensitivity(vad_sensitivity_state.state)
return vad.VadSensitivity.to_seconds(vad_sensitivity)
return vad_sensitivity
async def _resolve_announcement_media_id(
self,
+172 -2
View File
@@ -2,8 +2,11 @@
import itertools as it
import pytest
from homeassistant.components.assist_pipeline.vad import (
AudioBuffer,
VadSensitivity,
VoiceCommandSegmenter,
chunk_samples,
)
@@ -234,9 +237,11 @@ def test_speech_thresholds() -> None:
segmenter = VoiceCommandSegmenter(
before_command_speech_threshold=0.2,
in_command_speech_threshold=0.5,
in_command_silence_threshold=0.2,
command_seconds=2,
speech_seconds=1,
silence_seconds=1,
min_command_speech_seconds=0.0,
)
# Not high enough probability to trigger command
@@ -247,6 +252,171 @@ def test_speech_thresholds() -> None:
assert segmenter.process(_ONE_SECOND, 0.3)
assert segmenter.in_command
# Now that same probability is considered silence.
# Finishes command.
# Dead-band probability is neither speech nor silence: command stays open
assert segmenter.process(_ONE_SECOND, 0.3)
assert segmenter.in_command
# Clear silence finishes the command
assert not segmenter.process(_ONE_SECOND, 0.05)
def test_dead_band_preserves_pause() -> None:
    """Verify a pause filled with dead-band audio does not cut the command short."""
    half_second = _ONE_SECOND * 0.5
    vad_segmenter = VoiceCommandSegmenter(
        speech_seconds=0.5,
        command_seconds=1.0,
        silence_seconds=1.0,
        reset_seconds=0.5,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
    )

    # Confident speech opens the command
    assert vad_segmenter.process(half_second, 1.0)
    assert vad_segmenter.in_command

    # Three chunks at 0.3 sit between the silence (0.2) and speech (0.5)
    # thresholds. Together they outlast silence_seconds, yet the command must
    # stay open; single-threshold logic would have ended it during this pause.
    for _ in range(3):
        assert vad_segmenter.process(half_second, 0.3)
        assert vad_segmenter.in_command

    # The speaker resumes, and only genuine silence closes the command
    assert vad_segmenter.process(half_second, 1.0)
    assert vad_segmenter.process(half_second, 0.0)
    assert not vad_segmenter.process(half_second, 0.0)
    assert not vad_segmenter.in_command
def test_dead_band_times_out() -> None:
    """Verify endless dead-band audio is stopped by the timeout, not by a finish."""
    vad_segmenter = VoiceCommandSegmenter(
        timeout_seconds=2.0,
        speech_seconds=0.5,
        in_command_silence_threshold=0.2,
        in_command_speech_threshold=0.5,
    )

    # Confident speech opens the command
    assert vad_segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert vad_segmenter.in_command

    # Still inside timeout_seconds: dead-band audio keeps the command open
    keep_listening = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert keep_listening
    assert not vad_segmenter.timed_out

    # This chunk crosses timeout_seconds, so processing stops via the timeout
    keep_listening = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert not keep_listening
    assert vad_segmenter.timed_out
def test_false_start_aborts_and_resumes() -> None:
    """Verify a trigger lacking confident speech is discarded, not finished."""
    vad_segmenter = VoiceCommandSegmenter(
        speech_seconds=0.3,
        command_seconds=1.0,
        silence_seconds=0.5,
        min_command_speech_seconds=0.1,
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
    )

    # 0.35 clears the entry threshold (0.2) but not the in-command speech
    # threshold (0.5), so the command opens without any confident speech.
    weak_trigger = vad_segmenter.process(_ONE_SECOND * 0.3, 0.35)
    assert weak_trigger
    assert vad_segmenter.in_command

    # Silence would normally finish the command here. With no confident speech
    # recorded, the segmenter drops the false start and goes back to listening
    # (process still returns True) instead of finishing on near-empty audio.
    assert vad_segmenter.process(_ONE_SECOND, 0.0)
    assert not vad_segmenter.in_command

    # A genuine command afterwards opens and finishes as usual
    assert vad_segmenter.process(_ONE_SECOND, 1.0)
    assert vad_segmenter.in_command
    assert not vad_segmenter.process(_ONE_SECOND, 0.0)
    assert not vad_segmenter.in_command
def test_false_start_timeout() -> None:
    """Verify a false activation stops at false_start_timeout_seconds."""
    vad_segmenter = VoiceCommandSegmenter(
        timeout_seconds=15.0,
        false_start_timeout_seconds=3.0,
        min_command_speech_seconds=0.05,
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
    )

    # A weak trigger (no confident speech) followed by silence. Neither chunk
    # reaches the 3s false-start limit, so the segmenter keeps listening.
    for probability in (0.35, 0.0):
        assert vad_segmenter.process(_ONE_SECOND, probability)
    assert not vad_segmenter.timed_out

    # The third chunk reaches the false-start limit with still no confident
    # speech: give up here, long before the 15s command timeout.
    assert not vad_segmenter.process(_ONE_SECOND, 0.0)
    assert vad_segmenter.timed_out
def test_false_start_timeout_not_applied_after_speech() -> None:
    """Verify confident speech replaces the false-start limit with the full timeout."""
    vad_segmenter = VoiceCommandSegmenter(
        speech_seconds=0.3,
        timeout_seconds=15.0,
        false_start_timeout_seconds=3.0,
        min_command_speech_seconds=0.05,
        in_command_speech_threshold=0.5,
    )

    # Early confident speech both opens and confirms the command
    assert vad_segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert vad_segmenter.in_command

    # A dead-band pause running past false_start_timeout_seconds must not end
    # the command early, because confident speech was already heard.
    long_pause = _ONE_SECOND * 4
    assert vad_segmenter.process(long_pause, 0.3)
    assert not vad_segmenter.timed_out
    assert vad_segmenter.in_command
def test_min_command_speech_disabled() -> None:
    """Verify min_command_speech_seconds=0 restores the original finish behavior."""
    vad_segmenter = VoiceCommandSegmenter(
        speech_seconds=0.3,
        command_seconds=1.0,
        silence_seconds=0.5,
        min_command_speech_seconds=0.0,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
    )

    # A transient below the in-command speech threshold still opens the command
    weak_trigger = vad_segmenter.process(_ONE_SECOND * 0.3, 0.35)
    assert weak_trigger
    assert vad_segmenter.in_command

    # With no confident-speech requirement, silence finishes it (no abort)
    keep_listening = vad_segmenter.process(_ONE_SECOND, 0.0)
    assert not keep_listening
    assert not vad_segmenter.in_command
@pytest.mark.parametrize(
    ("sensitivity", "expected_threshold"),
    [
        pytest.param(VadSensitivity.AGGRESSIVE, 0.5, id="aggressive"),
        pytest.param(VadSensitivity.DEFAULT, 0.2, id="default"),
        pytest.param(VadSensitivity.RELAXED, 0.1, id="relaxed"),
    ],
)
def test_vad_sensitivity_in_command_silence_threshold(
    sensitivity: VadSensitivity, expected_threshold: float
) -> None:
    """Verify each sensitivity level yields its in-command silence threshold."""
    actual_threshold = VadSensitivity.to_in_command_silence_threshold(sensitivity)
    assert actual_threshold == expected_threshold