First pass at acknowledgement

This commit is contained in:
Michael Hansen
2025-08-13 17:04:38 -05:00
parent cf68214c4d
commit 36d3086864
4 changed files with 123 additions and 11 deletions

View File

@@ -3,11 +3,14 @@
from __future__ import annotations from __future__ import annotations
from collections.abc import AsyncIterable from collections.abc import AsyncIterable
from http import HTTPStatus
from pathlib import Path
from typing import Any from typing import Any
from aiohttp import web
import voluptuous as vol import voluptuous as vol
from homeassistant.components import stt from homeassistant.components import http, stt
from homeassistant.core import Context, HomeAssistant from homeassistant.core import Context, HomeAssistant
from homeassistant.helpers import chat_session from homeassistant.helpers import chat_session
from homeassistant.helpers.typing import ConfigType from homeassistant.helpers.typing import ConfigType
@@ -86,6 +89,8 @@ async def async_setup(hass: HomeAssistant, config: ConfigType) -> bool:
await async_setup_pipeline_store(hass) await async_setup_pipeline_store(hass)
async_register_websocket_api(hass) async_register_websocket_api(hass)
hass.http.register_view(DefaultSoundsView(hass))
return True return True
@@ -133,3 +138,19 @@ async def async_pipeline_from_audio_stream(
) )
await pipeline_input.validate() await pipeline_input.validate()
await pipeline_input.execute() await pipeline_input.execute()
class DefaultSoundsView(http.HomeAssistantView):
    """Serve the built-in pipeline sounds (e.g. the acknowledge chime).

    Authentication is deliberately disabled (``requires_auth = False``) so
    media players on voice satellites can fetch the sound URL without Home
    Assistant credentials; access is constrained by the strict filename
    allow-list in :meth:`get`, which also prevents path traversal.
    """

    # NOTE: {{filename}} renders as the literal "{filename}" aiohttp path
    # variable, matching the ``filename`` parameter of ``get`` below.
    url = f"/api/{DOMAIN}/sounds/{{filename}}"
    name = f"api:{DOMAIN}:sounds"
    requires_auth = False

    def __init__(self, hass: HomeAssistant) -> None:
        """Initialize the view with the bundled sounds directory."""
        self.hass = hass
        # Sound files ship alongside this module in a "sounds" subdirectory.
        self.base_dir = Path(__file__).parent / "sounds"

    async def get(self, request: web.Request, filename: str) -> web.StreamResponse:
        """Return the requested bundled sound, rejecting unknown filenames."""
        # Allow-list check: only known bundled files may be served, so a
        # crafted filename (e.g. "../secrets.yaml") can never escape base_dir.
        if filename not in ("acknowledge.mp3",):
            return web.Response(body="Invalid filename", status=HTTPStatus.BAD_REQUEST)
        return web.FileResponse(self.base_dir / filename)

View File

@@ -3,7 +3,7 @@
"name": "Assist pipeline", "name": "Assist pipeline",
"after_dependencies": ["repairs"], "after_dependencies": ["repairs"],
"codeowners": ["@balloob", "@synesthesiam"], "codeowners": ["@balloob", "@synesthesiam"],
"dependencies": ["conversation", "stt", "tts", "wake_word"], "dependencies": ["conversation", "stt", "tts", "wake_word", "http"],
"documentation": "https://www.home-assistant.io/integrations/assist_pipeline", "documentation": "https://www.home-assistant.io/integrations/assist_pipeline",
"integration_type": "system", "integration_type": "system",
"iot_class": "local_push", "iot_class": "local_push",

View File

@@ -19,11 +19,24 @@ import wave
import hass_nabucasa import hass_nabucasa
import voluptuous as vol import voluptuous as vol
from homeassistant.components import conversation, stt, tts, wake_word, websocket_api from homeassistant.components import (
conversation,
media_source,
stt,
tts,
wake_word,
websocket_api,
)
from homeassistant.const import ATTR_SUPPORTED_FEATURES, MATCH_ALL from homeassistant.const import ATTR_SUPPORTED_FEATURES, MATCH_ALL
from homeassistant.core import Context, HomeAssistant, callback from homeassistant.core import Context, HomeAssistant, callback
from homeassistant.exceptions import HomeAssistantError from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers import chat_session, intent from homeassistant.helpers import (
chat_session,
device_registry as dr,
entity_registry as er,
intent,
network,
)
from homeassistant.helpers.collection import ( from homeassistant.helpers.collection import (
CHANGE_UPDATED, CHANGE_UPDATED,
CollectionError, CollectionError,
@@ -91,6 +104,8 @@ KEY_PIPELINE_CONVERSATION_DATA: HassKey[dict[str, PipelineConversationData]] = H
# Number of response parts to handle before streaming the response # Number of response parts to handle before streaming the response
STREAM_RESPONSE_CHARS = 60 STREAM_RESPONSE_CHARS = 60
DEFAULT_ACKNOWLEDGE_MEDIA_ID = f"/api/{DOMAIN}/sounds/acknowledge.mp3"
def validate_language(data: dict[str, Any]) -> Any: def validate_language(data: dict[str, Any]) -> Any:
"""Validate language settings.""" """Validate language settings."""
@@ -412,6 +427,8 @@ class Pipeline:
wake_word_entity: str | None wake_word_entity: str | None
wake_word_id: str | None wake_word_id: str | None
prefer_local_intents: bool = False prefer_local_intents: bool = False
acknowledge_same_area: bool = True
acknowledge_media_id: str | None = None
id: str = field(default_factory=ulid_util.ulid_now) id: str = field(default_factory=ulid_util.ulid_now)
@@ -436,6 +453,10 @@ class Pipeline:
wake_word_entity=data["wake_word_entity"], wake_word_entity=data["wake_word_entity"],
wake_word_id=data["wake_word_id"], wake_word_id=data["wake_word_id"],
prefer_local_intents=data.get("prefer_local_intents", False), prefer_local_intents=data.get("prefer_local_intents", False),
acknowledge_same_area=data.get("acknowledge_same_area", True),
acknowledge_media_id=data.get(
"acknowledge_media_id", DEFAULT_ACKNOWLEDGE_MEDIA_ID
),
) )
def to_json(self) -> dict[str, Any]: def to_json(self) -> dict[str, Any]:
@@ -454,6 +475,7 @@ class Pipeline:
"wake_word_entity": self.wake_word_entity, "wake_word_entity": self.wake_word_entity,
"wake_word_id": self.wake_word_id, "wake_word_id": self.wake_word_id,
"prefer_local_intents": self.prefer_local_intents, "prefer_local_intents": self.prefer_local_intents,
"acknowledge_media_id": self.acknowledge_media_id,
} }
@@ -1059,7 +1081,7 @@ class PipelineRun:
conversation_id: str, conversation_id: str,
device_id: str | None, device_id: str | None,
conversation_extra_system_prompt: str | None, conversation_extra_system_prompt: str | None,
) -> str: ) -> tuple[str, bool]:
"""Run intent recognition portion of pipeline. Returns text to speak.""" """Run intent recognition portion of pipeline. Returns text to speak."""
if self.intent_agent is None or self._conversation_data is None: if self.intent_agent is None or self._conversation_data is None:
raise RuntimeError("Recognize intent was not prepared") raise RuntimeError("Recognize intent was not prepared")
@@ -1107,6 +1129,7 @@ class PipelineRun:
agent_id = self.intent_agent.id agent_id = self.intent_agent.id
processed_locally = agent_id == conversation.HOME_ASSISTANT_AGENT processed_locally = agent_id == conversation.HOME_ASSISTANT_AGENT
all_same_area = False
intent_response: intent.IntentResponse | None = None intent_response: intent.IntentResponse | None = None
if not processed_locally and not self._intent_agent_only: if not processed_locally and not self._intent_agent_only:
# Sentence triggers override conversation agent # Sentence triggers override conversation agent
@@ -1136,7 +1159,8 @@ class PipelineRun:
# Try local intents # Try local intents
if ( if (
intent_response is None self.pipeline.acknowledge_same_area
and intent_response is None
and self.pipeline.prefer_local_intents and self.pipeline.prefer_local_intents
and ( and (
intent_response := await conversation.async_handle_intents( intent_response := await conversation.async_handle_intents(
@@ -1280,6 +1304,43 @@ class PipelineRun:
if tts_input_stream and self._streamed_response_text: if tts_input_stream and self._streamed_response_text:
tts_input_stream.put_nowait(None) tts_input_stream.put_nowait(None)
intent_response = conversation_result.response
device_registry = dr.async_get(self.hass)
if (
(
intent_response.response_type
== intent.IntentResponseType.ACTION_DONE
)
and intent_response.matched_states
and device_id
and (device := device_registry.async_get(device_id))
and device.area_id
):
entity_registry = er.async_get(self.hass)
all_same_area = True
for state in intent_response.matched_states:
entity = entity_registry.async_get(state.entity_id)
if (
(not entity)
or (
entity.area_id
and (entity.area_id != device.area_id)
)
or (
entity.device_id
and (
entity_device := device_registry.async_get(
entity.device_id
)
)
and entity_device.area_id != device.area_id
)
):
all_same_area = False
break
_LOGGER.error("All same area: %s", all_same_area)
except Exception as src_error: except Exception as src_error:
_LOGGER.exception("Unexpected error during intent recognition") _LOGGER.exception("Unexpected error during intent recognition")
raise IntentRecognitionError( raise IntentRecognitionError(
@@ -1302,7 +1363,7 @@ class PipelineRun:
if conversation_result.continue_conversation: if conversation_result.continue_conversation:
self._conversation_data.continue_conversation_agent = agent_id self._conversation_data.continue_conversation_agent = agent_id
return speech return speech, all_same_area
async def prepare_text_to_speech(self) -> None: async def prepare_text_to_speech(self) -> None:
"""Prepare text-to-speech.""" """Prepare text-to-speech."""
@@ -1370,6 +1431,30 @@ class PipelineRun:
PipelineEvent(PipelineEventType.TTS_END, {"tts_output": tts_output}) PipelineEvent(PipelineEventType.TTS_END, {"tts_output": tts_output})
) )
async def acknowledge(self, media_id: str, tts_input: str | None) -> None:
    """Emit TTS start/end events pointing at a pre-recorded acknowledge sound.

    Instead of synthesizing speech, this publishes the resolved URL of
    *media_id* as the TTS output so the satellite plays the canned sound.
    """
    start_payload = {
        "language": self.pipeline.tts_language,
        "voice": self.pipeline.tts_voice,
        "tts_input": tts_input or "",
    }
    self.process_event(PipelineEvent(PipelineEventType.TTS_START, start_payload))

    # Media-source ids are resolved to a playable URL; plain paths are made
    # absolute by prefixing the instance's base URL.
    if media_source.is_media_source_id(media_id):
        resolved = await media_source.async_resolve_media(self.hass, media_id, None)
        sound_url = resolved.url
    else:
        sound_url = network.get_url(self.hass) + media_id

    self.process_event(
        PipelineEvent(PipelineEventType.TTS_END, {"tts_output": {"url": sound_url}})
    )
def _capture_chunk(self, audio_bytes: bytes | None) -> None: def _capture_chunk(self, audio_bytes: bytes | None) -> None:
"""Forward audio chunk to various capturing mechanisms.""" """Forward audio chunk to various capturing mechanisms."""
if self.debug_recording_queue is not None: if self.debug_recording_queue is not None:
@@ -1649,17 +1734,18 @@ class PipelineInput:
if self.run.end_stage != PipelineStage.STT: if self.run.end_stage != PipelineStage.STT:
tts_input = self.tts_input tts_input = self.tts_input
all_same_area = False
if current_stage == PipelineStage.INTENT: if current_stage == PipelineStage.INTENT:
# intent-recognition # intent-recognition
assert intent_input is not None assert intent_input is not None
tts_input = await self.run.recognize_intent( tts_input, all_same_area = await self.run.recognize_intent(
intent_input, intent_input,
self.session.conversation_id, self.session.conversation_id,
self.device_id, self.device_id,
self.conversation_extra_system_prompt, self.conversation_extra_system_prompt,
) )
if tts_input.strip(): if all_same_area or tts_input.strip():
current_stage = PipelineStage.TTS current_stage = PipelineStage.TTS
else: else:
# Skip TTS # Skip TTS
@@ -1668,6 +1754,11 @@ class PipelineInput:
if self.run.end_stage != PipelineStage.INTENT: if self.run.end_stage != PipelineStage.INTENT:
# text-to-speech # text-to-speech
if current_stage == PipelineStage.TTS: if current_stage == PipelineStage.TTS:
if all_same_area and self.run.pipeline.acknowledge_media_id:
await self.run.acknowledge(
self.run.pipeline.acknowledge_media_id, tts_input
)
else:
assert tts_input is not None assert tts_input is not None
await self.run.text_to_speech(tts_input) await self.run.text_to_speech(tts_input)