First pass at acknowledgement

2025-08-30 18:01:31 +02:00 · 2025-08-13 17:04:38 -05:00
parent cf68214c4d
commit 36d3086864
4 changed files with 123 additions and 11 deletions
--- a/homeassistant/components/assist_pipeline/init.py
+++ b/homeassistant/components/assist_pipeline/init.py
@@ -3,11 +3,14 @@
 from __future__ import annotations

 from collections.abc import AsyncIterable
+from http import HTTPStatus
+from pathlib import Path
 from typing import Any

+from aiohttp import web
 import voluptuous as vol

-from homeassistant.components import stt
+from homeassistant.components import http, stt
 from homeassistant.core import Context, HomeAssistant
 from homeassistant.helpers import chat_session
 from homeassistant.helpers.typing import ConfigType
@@ -86,6 +89,8 @@ async def async_setup(hass: HomeAssistant, config: ConfigType) -> bool:
    await async_setup_pipeline_store(hass)
    async_register_websocket_api(hass)

+    hass.http.register_view(DefaultSoundsView(hass))
+
    return True


@@ -133,3 +138,19 @@ async def async_pipeline_from_audio_stream(
        )
        await pipeline_input.validate()
        await pipeline_input.execute()
+
+
+class DefaultSoundsView(http.HomeAssistantView):
+    """HTTP view that serves the sound files bundled with this integration.
+
+    Only file names listed in ``ALLOWED_FILENAMES`` are served. The name is
+    checked against that allow-list before it is joined to ``base_dir``, so a
+    request cannot select a file outside the ``sounds`` directory.
+    """
+
+    url = f"/api/{DOMAIN}/sounds/{{filename}}"
+    name = f"api:{DOMAIN}:sounds"
+    # NOTE(review): served without authentication, presumably so that media
+    # players can fetch the sound by plain URL - confirm this is intended.
+    requires_auth = False
+
+    # Closed set of servable files; extend it when another sound is bundled.
+    ALLOWED_FILENAMES = frozenset({"acknowledge.mp3"})
+
+    def __init__(self, hass: HomeAssistant) -> None:
+        """Store the Home Assistant instance and locate the sounds directory."""
+        self.hass = hass
+        self.base_dir = Path(__file__).parent / "sounds"
+
+    async def get(self, request: web.Request, filename: str) -> web.StreamResponse:
+        """Return the requested sound file.
+
+        Responds with 400 Bad Request when ``filename`` is not in the allow-list.
+        """
+        if filename not in self.ALLOWED_FILENAMES:
+            return web.Response(body="Invalid filename", status=HTTPStatus.BAD_REQUEST)
+
+        return web.FileResponse(self.base_dir / filename)
--- a/homeassistant/components/assist_pipeline/manifest.json
+++ b/homeassistant/components/assist_pipeline/manifest.json
@@ -3,7 +3,7 @@
  "name": "Assist pipeline",
  "after_dependencies": ["repairs"],
  "codeowners": ["@balloob", "@synesthesiam"],
-  "dependencies": ["conversation", "stt", "tts", "wake_word"],
+  "dependencies": ["conversation", "stt", "tts", "wake_word", "http"],
  "documentation": "https://www.home-assistant.io/integrations/assist_pipeline",
  "integration_type": "system",
  "iot_class": "local_push",
--- a/homeassistant/components/assist_pipeline/pipeline.py
+++ b/homeassistant/components/assist_pipeline/pipeline.py
@@ -19,11 +19,24 @@ import wave
 import hass_nabucasa
 import voluptuous as vol

-from homeassistant.components import conversation, stt, tts, wake_word, websocket_api
+from homeassistant.components import (
+    conversation,
+    media_source,
+    stt,
+    tts,
+    wake_word,
+    websocket_api,
+)
 from homeassistant.const import ATTR_SUPPORTED_FEATURES, MATCH_ALL
 from homeassistant.core import Context, HomeAssistant, callback
 from homeassistant.exceptions import HomeAssistantError
-from homeassistant.helpers import chat_session, intent
+from homeassistant.helpers import (
+    chat_session,
+    device_registry as dr,
+    entity_registry as er,
+    intent,
+    network,
+)
 from homeassistant.helpers.collection import (
    CHANGE_UPDATED,
    CollectionError,
@@ -91,6 +104,8 @@ KEY_PIPELINE_CONVERSATION_DATA: HassKey[dict[str, PipelineConversationData]] = H
 # Number of response parts to handle before streaming the response
 STREAM_RESPONSE_CHARS = 60

+DEFAULT_ACKNOWLEDGE_MEDIA_ID = f"/api/{DOMAIN}/sounds/acknowledge.mp3"
+

 def validate_language(data: dict[str, Any]) -> Any:
    """Validate language settings."""
@@ -412,6 +427,8 @@ class Pipeline:
    wake_word_entity: str | None
    wake_word_id: str | None
    prefer_local_intents: bool = False
+    acknowledge_same_area: bool = True
+    acknowledge_media_id: str | None = None

    id: str = field(default_factory=ulid_util.ulid_now)

@@ -436,6 +453,10 @@ class Pipeline:
            wake_word_entity=data["wake_word_entity"],
            wake_word_id=data["wake_word_id"],
            prefer_local_intents=data.get("prefer_local_intents", False),
+            acknowledge_same_area=data.get("acknowledge_same_area", True),
+            acknowledge_media_id=data.get(
+                "acknowledge_media_id", DEFAULT_ACKNOWLEDGE_MEDIA_ID
+            ),
        )

    def to_json(self) -> dict[str, Any]:
@@ -454,6 +475,7 @@ class Pipeline:
            "wake_word_entity": self.wake_word_entity,
            "wake_word_id": self.wake_word_id,
            "prefer_local_intents": self.prefer_local_intents,
+            "acknowledge_media_id": self.acknowledge_media_id,
        }


@@ -1059,7 +1081,7 @@ class PipelineRun:
        conversation_id: str,
        device_id: str | None,
        conversation_extra_system_prompt: str | None,
-    ) -> str:
+    ) -> tuple[str, bool]:
        """Run intent recognition portion of pipeline. Returns text to speak."""
        if self.intent_agent is None or self._conversation_data is None:
            raise RuntimeError("Recognize intent was not prepared")
@@ -1107,6 +1129,7 @@ class PipelineRun:

            agent_id = self.intent_agent.id
            processed_locally = agent_id == conversation.HOME_ASSISTANT_AGENT
+            all_same_area = False
            intent_response: intent.IntentResponse | None = None
            if not processed_locally and not self._intent_agent_only:
                # Sentence triggers override conversation agent
@@ -1136,7 +1159,8 @@ class PipelineRun:

                # Try local intents
                if (
-                    intent_response is None
+                    self.pipeline.acknowledge_same_area
+                    and intent_response is None
                    and self.pipeline.prefer_local_intents
                    and (
                        intent_response := await conversation.async_handle_intents(
@@ -1280,6 +1304,43 @@ class PipelineRun:
                    if tts_input_stream and self._streamed_response_text:
                        tts_input_stream.put_nowait(None)

+                    # Decide whether every entity matched by the intent is in
+                    # the same area as the device identified by device_id.
+                    intent_response = conversation_result.response
+                    device_registry = dr.async_get(self.hass)
+                    if (
+                        (
+                            intent_response.response_type
+                            == intent.IntentResponseType.ACTION_DONE
+                        )
+                        and intent_response.matched_states
+                        and device_id
+                        and (device := device_registry.async_get(device_id))
+                        and device.area_id
+                    ):
+                        entity_registry = er.async_get(self.hass)
+                        all_same_area = True
+                        for state in intent_response.matched_states:
+                            entity = entity_registry.async_get(state.entity_id)
+                            if not entity:
+                                # No registry entry, so the area is unknown.
+                                all_same_area = False
+                                break
+
+                            # An area set on the entity itself takes precedence;
+                            # only fall back to the area of the entity's device
+                            # when the entity has none of its own.
+                            entity_area_id = entity.area_id
+                            if (
+                                not entity_area_id
+                                and entity.device_id
+                                and (
+                                    entity_device := device_registry.async_get(
+                                        entity.device_id
+                                    )
+                                )
+                            ):
+                                entity_area_id = entity_device.area_id
+
+                            # An entity whose area cannot be determined is not
+                            # treated as being in the same area.
+                            if entity_area_id != device.area_id:
+                                all_same_area = False
+                                break
+
+                    # Diagnostic trace only; not an error condition.
+                    _LOGGER.debug("All same area: %s", all_same_area)
+
        except Exception as src_error:
            _LOGGER.exception("Unexpected error during intent recognition")
            raise IntentRecognitionError(
@@ -1302,7 +1363,7 @@ class PipelineRun:
        if conversation_result.continue_conversation:
            self._conversation_data.continue_conversation_agent = agent_id

-        return speech
+        return speech, all_same_area

    async def prepare_text_to_speech(self) -> None:
        """Prepare text-to-speech."""
@@ -1370,6 +1431,30 @@ class PipelineRun:
            PipelineEvent(PipelineEventType.TTS_END, {"tts_output": tts_output})
        )

+    async def acknowledge(self, media_id: str, tts_input: str | None) -> None:
+        """Report an acknowledgement sound in place of synthesized speech.
+
+        Emits a TTS_START event followed by a TTS_END event whose output URL
+        points at the acknowledgement media.
+
+        media_id is either a media source id, which is resolved to its URL, or
+        a path that is appended to the URL returned by network.get_url.
+        tts_input is the text reported in the TTS_START event; None is
+        reported as an empty string.
+        """
+        start_data = {
+            "language": self.pipeline.tts_language,
+            "voice": self.pipeline.tts_voice,
+            "tts_input": tts_input or "",
+        }
+        self.process_event(PipelineEvent(PipelineEventType.TTS_START, start_data))
+
+        if not media_source.is_media_source_id(media_id):
+            # Not a media source id: treat it as a path on this instance.
+            url = network.get_url(self.hass) + media_id
+        else:
+            resolved = await media_source.async_resolve_media(
+                self.hass, media_id, None
+            )
+            url = resolved.url
+
+        end_data = {"tts_output": {"url": url}}
+        self.process_event(PipelineEvent(PipelineEventType.TTS_END, end_data))
+
    def _capture_chunk(self, audio_bytes: bytes | None) -> None:
        """Forward audio chunk to various capturing mechanisms."""
        if self.debug_recording_queue is not None:
@@ -1649,17 +1734,18 @@ class PipelineInput:

            if self.run.end_stage != PipelineStage.STT:
                tts_input = self.tts_input
+                all_same_area = False

                if current_stage == PipelineStage.INTENT:
                    # intent-recognition
                    assert intent_input is not None
-                    tts_input = await self.run.recognize_intent(
+                    tts_input, all_same_area = await self.run.recognize_intent(
                        intent_input,
                        self.session.conversation_id,
                        self.device_id,
                        self.conversation_extra_system_prompt,
                    )
-                    if tts_input.strip():
+                    if all_same_area or tts_input.strip():
                        current_stage = PipelineStage.TTS
                    else:
                        # Skip TTS
@@ -1668,8 +1754,13 @@ class PipelineInput:
                if self.run.end_stage != PipelineStage.INTENT:
                    # text-to-speech
                    if current_stage == PipelineStage.TTS:
-                        assert tts_input is not None
-                        await self.run.text_to_speech(tts_input)
+                        if all_same_area and self.run.pipeline.acknowledge_media_id:
+                            await self.run.acknowledge(
+                                self.run.pipeline.acknowledge_media_id, tts_input
+                            )
+                        else:
+                            assert tts_input is not None
+                            await self.run.text_to_speech(tts_input)

        except PipelineError as err:
            self.run.process_event(
--- a/homeassistant/components/assist_pipeline/sounds/acknowledge.mp3
+++ b/homeassistant/components/assist_pipeline/sounds/acknowledge.mp3