mirror of
https://github.com/home-assistant/core.git
synced 2026-07-01 02:55:57 +02:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f37023a316 | |||
| 66e8c50377 |
@@ -522,6 +522,9 @@ class AudioSettings:
|
||||
silence_seconds: float = 0.7
|
||||
"""Seconds of silence after voice command has ended."""
|
||||
|
||||
in_command_silence_threshold: float = 0.2
|
||||
"""Probability below which in-command audio counts toward ending the command."""
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Verify settings post-initialization."""
|
||||
if (self.noise_suppression_level < 0) or (self.noise_suppression_level > 4):
|
||||
@@ -953,7 +956,10 @@ class PipelineRun:
|
||||
and self.stt_provider.audio_processing.requires_external_vad
|
||||
):
|
||||
stt_vad = VoiceCommandSegmenter(
|
||||
silence_seconds=self.audio_settings.silence_seconds
|
||||
silence_seconds=self.audio_settings.silence_seconds,
|
||||
in_command_silence_threshold=(
|
||||
self.audio_settings.in_command_silence_threshold
|
||||
),
|
||||
)
|
||||
|
||||
result = await self.stt_provider.async_process_audio_stream(
|
||||
|
||||
@@ -29,6 +29,23 @@ class VadSensitivity(StrEnum):
|
||||
|
||||
return 0.7
|
||||
|
||||
@staticmethod
def to_in_command_silence_threshold(sensitivity: VadSensitivity | str) -> float:
    """Map a sensitivity level to the in-command silence probability threshold.

    The argument may be an enum member or its string value; it is coerced
    with ``VadSensitivity(...)`` before the lookup.

    A lower threshold keeps the command open through thinking pauses (more
    pause tolerance, more hang-on in noise). A threshold equal to the
    in-command speech threshold disables the dead band entirely.
    """
    level = VadSensitivity(sensitivity)

    # Only the non-default levels need an explicit entry.
    thresholds = {
        VadSensitivity.RELAXED: 0.1,
        VadSensitivity.AGGRESSIVE: 0.5,
    }

    # Every other level falls back to 0.2.
    return thresholds.get(level, 0.2)
|
||||
|
||||
|
||||
class AudioBuffer:
|
||||
"""Fixed-sized audio buffer with variable internal length."""
|
||||
@@ -100,9 +117,21 @@ class VoiceCommandSegmenter:
|
||||
in_command_speech_threshold: float = 0.5
|
||||
"""Probability threshold for speech during voice command."""
|
||||
|
||||
in_command_silence_threshold: float = 0.2
|
||||
"""Probability below which an in-command chunk counts toward ending the command."""
|
||||
|
||||
min_command_speech_seconds: float = 0.05
|
||||
"""Confident speech required before a command may finish on silence."""
|
||||
|
||||
false_start_timeout_seconds: float = 5.0
|
||||
"""Seconds to wait for confident speech before giving up (a false activation)."""
|
||||
|
||||
_speech_seconds_left: float = 0.0
|
||||
"""Seconds left before considering voice command as started."""
|
||||
|
||||
_command_speech_seconds_left: float = 0.0
|
||||
"""Confident speech still required before the command may finish."""
|
||||
|
||||
_command_seconds_left: float = 0.0
|
||||
"""Seconds left before voice command could stop."""
|
||||
|
||||
@@ -123,6 +152,7 @@ class VoiceCommandSegmenter:
|
||||
"""Reset all counters and state."""
|
||||
self._speech_seconds_left = self.speech_seconds
|
||||
self._command_seconds_left = self.command_seconds - self.speech_seconds
|
||||
self._command_speech_seconds_left = self.min_command_speech_seconds
|
||||
self._silence_seconds_left = self.silence_seconds
|
||||
self._timeout_seconds_left = self.timeout_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
@@ -137,10 +167,15 @@ class VoiceCommandSegmenter:
|
||||
self.timed_out = False
|
||||
|
||||
self._timeout_seconds_left -= chunk_seconds
|
||||
if self._timeout_seconds_left <= 0:
|
||||
no_speech_yet = self._command_speech_seconds_left > 0
|
||||
elapsed = self.timeout_seconds - self._timeout_seconds_left
|
||||
if (self._timeout_seconds_left <= 0) or (
|
||||
no_speech_yet and (elapsed >= self.false_start_timeout_seconds)
|
||||
):
|
||||
_LOGGER.debug(
|
||||
"VAD end of speech detection timed out after %s seconds",
|
||||
self.timeout_seconds,
|
||||
"VAD timed out after %.2f seconds (no_speech_yet=%s)",
|
||||
elapsed,
|
||||
no_speech_yet,
|
||||
)
|
||||
self.reset()
|
||||
self.timed_out = True
|
||||
@@ -148,6 +183,9 @@ class VoiceCommandSegmenter:
|
||||
|
||||
if speech_probability is None:
|
||||
speech_probability = 0.0
|
||||
else:
|
||||
# MicroVad returns -1.0 as a "no result yet" sentinel; clamp to [0, 1]
|
||||
speech_probability = min(1.0, max(0.0, speech_probability))
|
||||
|
||||
if not self.in_command:
|
||||
# Before command
|
||||
@@ -155,6 +193,9 @@ class VoiceCommandSegmenter:
|
||||
if is_speech:
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._speech_seconds_left -= chunk_seconds
|
||||
if speech_probability > self.in_command_speech_threshold:
|
||||
# Confident speech during onset also counts toward confirmation
|
||||
self._command_speech_seconds_left -= chunk_seconds
|
||||
if self._speech_seconds_left <= 0:
|
||||
# Inside voice command
|
||||
self.in_command = True
|
||||
@@ -168,30 +209,37 @@ class VoiceCommandSegmenter:
|
||||
self._reset_seconds_left -= chunk_seconds
|
||||
if self._reset_seconds_left <= 0:
|
||||
self._speech_seconds_left = self.speech_seconds
|
||||
self._command_speech_seconds_left = self.min_command_speech_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
else:
|
||||
# In command
|
||||
is_speech = speech_probability > self.in_command_speech_threshold
|
||||
if not is_speech:
|
||||
# Silence in command
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._silence_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if (self._silence_seconds_left <= 0) and (
|
||||
self._command_seconds_left <= 0
|
||||
):
|
||||
# Command finished successfully
|
||||
self.reset()
|
||||
_LOGGER.debug("Voice command finished")
|
||||
return False
|
||||
else:
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if speech_probability > self.in_command_speech_threshold:
|
||||
# Speech in command.
|
||||
self._command_speech_seconds_left -= chunk_seconds
|
||||
# Reset silence counter if enough speech.
|
||||
self._reset_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if self._reset_seconds_left <= 0:
|
||||
self._silence_seconds_left = self.silence_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
elif speech_probability < self.in_command_silence_threshold:
|
||||
# Silence in command
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._silence_seconds_left -= chunk_seconds
|
||||
if (self._silence_seconds_left <= 0) and (
|
||||
self._command_seconds_left <= 0
|
||||
):
|
||||
if self._command_speech_seconds_left <= 0:
|
||||
# Command finished successfully
|
||||
self.reset()
|
||||
_LOGGER.debug("Voice command finished")
|
||||
return False
|
||||
# Triggered without real speech: false start, resume listening
|
||||
_LOGGER.debug("Discarding false start (no speech detected)")
|
||||
timeout_seconds_left = self._timeout_seconds_left
|
||||
self.reset()
|
||||
self._timeout_seconds_left = timeout_seconds_left
|
||||
# else: dead-band activity holds the command open without refreshing
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -506,6 +506,7 @@ class AssistSatelliteEntity(entity.Entity):
|
||||
# Store the conversation ID. If it is no longer valid,
|
||||
# get_chat_session will reset it
|
||||
self._conversation_id = session.conversation_id
|
||||
vad_sensitivity = self._resolve_vad_sensitivity()
|
||||
self._pipeline_task = (
|
||||
self.platform.config_entry.async_create_background_task(
|
||||
self.hass,
|
||||
@@ -529,7 +530,14 @@ class AssistSatelliteEntity(entity.Entity):
|
||||
tts_audio_output=self.tts_options,
|
||||
wake_word_phrase=wake_word_phrase,
|
||||
audio_settings=AudioSettings(
|
||||
silence_seconds=self._resolve_vad_sensitivity()
|
||||
silence_seconds=vad.VadSensitivity.to_seconds(
|
||||
vad_sensitivity
|
||||
),
|
||||
in_command_silence_threshold=(
|
||||
vad.VadSensitivity.to_in_command_silence_threshold(
|
||||
vad_sensitivity
|
||||
)
|
||||
),
|
||||
),
|
||||
start_stage=start_stage,
|
||||
end_stage=end_stage,
|
||||
@@ -627,7 +635,7 @@ class AssistSatelliteEntity(entity.Entity):
|
||||
return None
|
||||
|
||||
@callback
|
||||
def _resolve_vad_sensitivity(self) -> float:
|
||||
def _resolve_vad_sensitivity(self) -> vad.VadSensitivity:
|
||||
"""Resolve VAD sensitivity from select entity to enum."""
|
||||
vad_sensitivity = vad.VadSensitivity.DEFAULT
|
||||
|
||||
@@ -639,7 +647,7 @@ class AssistSatelliteEntity(entity.Entity):
|
||||
|
||||
vad_sensitivity = vad.VadSensitivity(vad_sensitivity_state.state)
|
||||
|
||||
return vad.VadSensitivity.to_seconds(vad_sensitivity)
|
||||
return vad_sensitivity
|
||||
|
||||
async def _resolve_announcement_media_id(
|
||||
self,
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
|
||||
import itertools as it
|
||||
|
||||
import pytest
|
||||
|
||||
from homeassistant.components.assist_pipeline.vad import (
|
||||
AudioBuffer,
|
||||
VadSensitivity,
|
||||
VoiceCommandSegmenter,
|
||||
chunk_samples,
|
||||
)
|
||||
@@ -234,9 +237,11 @@ def test_speech_thresholds() -> None:
|
||||
segmenter = VoiceCommandSegmenter(
|
||||
before_command_speech_threshold=0.2,
|
||||
in_command_speech_threshold=0.5,
|
||||
in_command_silence_threshold=0.2,
|
||||
command_seconds=2,
|
||||
speech_seconds=1,
|
||||
silence_seconds=1,
|
||||
min_command_speech_seconds=0.0,
|
||||
)
|
||||
|
||||
# Not high enough probability to trigger command
|
||||
@@ -247,6 +252,171 @@ def test_speech_thresholds() -> None:
|
||||
assert segmenter.process(_ONE_SECOND, 0.3)
|
||||
assert segmenter.in_command
|
||||
|
||||
# Now that same probability is considered silence.
|
||||
# Finishes command.
|
||||
# Dead-band probability is neither speech nor silence: command stays open
|
||||
assert segmenter.process(_ONE_SECOND, 0.3)
|
||||
assert segmenter.in_command
|
||||
|
||||
# Clear silence finishes the command
|
||||
assert not segmenter.process(_ONE_SECOND, 0.05)
|
||||
|
||||
|
||||
def test_dead_band_preserves_pause() -> None:
    """Verify a pause filled with dead-band audio does not finish the command."""
    half_second = _ONE_SECOND * 0.5
    vad_segmenter = VoiceCommandSegmenter(
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
        speech_seconds=0.5,
        command_seconds=1.0,
        silence_seconds=1.0,
        reset_seconds=0.5,
    )

    # Confident speech opens the command.
    assert vad_segmenter.process(half_second, 1.0)
    assert vad_segmenter.in_command

    # Three half-second dead-band chunks (1.5s total, longer than
    # silence_seconds) must leave the command open. A single-threshold
    # segmenter would have ended the command during this pause.
    for dead_band_probability in (0.3, 0.3, 0.3):
        assert vad_segmenter.process(half_second, dead_band_probability)
        assert vad_segmenter.in_command

    # Speech comes back; only genuine silence afterwards closes the command.
    assert vad_segmenter.process(half_second, 1.0)
    assert vad_segmenter.process(half_second, 0.0)
    assert not vad_segmenter.process(half_second, 0.0)
    assert not vad_segmenter.in_command
|
||||
|
||||
|
||||
def test_dead_band_times_out() -> None:
    """Verify sustained dead-band audio ends by timeout, not by an early finish."""
    settings = {
        "in_command_speech_threshold": 0.5,
        "in_command_silence_threshold": 0.2,
        "speech_seconds": 0.5,
        "timeout_seconds": 2.0,
    }
    vad_segmenter = VoiceCommandSegmenter(**settings)

    # Confident speech opens the command.
    assert vad_segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert vad_segmenter.in_command

    # The first dead-band second keeps the command running without a timeout.
    still_running = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert still_running
    assert not vad_segmenter.timed_out

    # The next dead-band second exhausts timeout_seconds.
    still_running = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert not still_running
    assert vad_segmenter.timed_out
|
||||
|
||||
|
||||
def test_false_start_aborts_and_resumes() -> None:
    """Verify a trigger lacking confident speech is aborted rather than finished."""
    vad_segmenter = VoiceCommandSegmenter(
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
        min_command_speech_seconds=0.1,
        speech_seconds=0.3,
        command_seconds=1.0,
        silence_seconds=0.5,
    )

    # Each step: (chunk seconds, speech probability,
    #             expected process() result, expected in_command afterwards)
    steps = (
        # A sub-0.5 transient (above entry, below speech) triggers a command...
        (_ONE_SECOND * 0.3, 0.35, True, True),
        # ...but with no confident speech the silence end aborts back to
        # listening instead of finishing into near-empty audio.
        (_ONE_SECOND, 0.0, True, False),
        # The real command then arrives...
        (_ONE_SECOND, 1.0, True, True),
        # ...and finishes normally on silence.
        (_ONE_SECOND, 0.0, False, False),
    )

    for chunk_seconds, probability, keeps_going, in_command in steps:
        assert bool(vad_segmenter.process(chunk_seconds, probability)) is keeps_going
        assert bool(vad_segmenter.in_command) is in_command
|
||||
|
||||
|
||||
def test_false_start_timeout() -> None:
    """Verify a false activation gives up at false_start_timeout_seconds.

    The full timeout_seconds (15s here) must not be needed.
    """
    vad_segmenter = VoiceCommandSegmenter(
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
        min_command_speech_seconds=0.05,
        false_start_timeout_seconds=3.0,
        timeout_seconds=15.0,
    )

    # Second 1: a trigger that never reaches the confident-speech threshold.
    # Second 2: silence. Neither second gives up yet.
    for probability in (0.35, 0.0):
        assert vad_segmenter.process(_ONE_SECOND, probability)
    assert not vad_segmenter.timed_out

    # Second 3: elapsed time reaches false_start_timeout_seconds with no
    # confident speech, so the segmenter reports a timeout.
    finished = not vad_segmenter.process(_ONE_SECOND, 0.0)
    assert finished
    assert vad_segmenter.timed_out
|
||||
|
||||
|
||||
def test_false_start_timeout_not_applied_after_speech() -> None:
    """Verify confident speech replaces the false-start limit with the full timeout."""
    settings = {
        "in_command_speech_threshold": 0.5,
        "min_command_speech_seconds": 0.05,
        "false_start_timeout_seconds": 3.0,
        "timeout_seconds": 15.0,
        "speech_seconds": 0.3,
    }
    vad_segmenter = VoiceCommandSegmenter(**settings)

    # Early confident speech confirms the command...
    confirmed = vad_segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert confirmed
    assert vad_segmenter.in_command

    # ...so a pause running past false_start_timeout_seconds must NOT give up.
    long_pause = _ONE_SECOND * 4
    assert vad_segmenter.process(long_pause, 0.3)
    assert not vad_segmenter.timed_out
    assert vad_segmenter.in_command
|
||||
|
||||
|
||||
def test_min_command_speech_disabled() -> None:
    """Verify min_command_speech_seconds=0 retains the original finish behavior."""
    vad_segmenter = VoiceCommandSegmenter(
        in_command_speech_threshold=0.5,
        in_command_silence_threshold=0.2,
        min_command_speech_seconds=0.0,
        speech_seconds=0.3,
        command_seconds=1.0,
        silence_seconds=0.5,
    )

    # Each step: (chunk seconds, speech probability,
    #             expected process() result, expected in_command afterwards)
    steps = (
        # A sub-0.5 transient is enough to trigger the command...
        (_ONE_SECOND * 0.3, 0.35, True, True),
        # ...and silence finishes it; there is no false-start abort.
        (_ONE_SECOND, 0.0, False, False),
    )

    for chunk_seconds, probability, keeps_going, in_command in steps:
        assert bool(vad_segmenter.process(chunk_seconds, probability)) is keeps_going
        assert bool(vad_segmenter.in_command) is in_command
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("sensitivity", "expected_threshold"),
    [
        pytest.param(VadSensitivity.AGGRESSIVE, 0.5, id="aggressive"),
        pytest.param(VadSensitivity.DEFAULT, 0.2, id="default"),
        pytest.param(VadSensitivity.RELAXED, 0.1, id="relaxed"),
    ],
)
def test_vad_sensitivity_in_command_silence_threshold(
    sensitivity: VadSensitivity, expected_threshold: float
) -> None:
    """Verify each sensitivity level yields its in-command silence threshold."""
    actual_threshold = VadSensitivity.to_in_command_silence_threshold(sensitivity)

    assert actual_threshold == expected_threshold
|
||||
|
||||
Reference in New Issue
Block a user