"""Helpers for cloud LLM chat handling."""
import base64
from collections.abc import AsyncGenerator, Callable, Iterable
from enum import Enum
import json
import logging
import re
from typing import Any, Literal, cast

from hass_nabucasa import Cloud, NabuCasaBaseError
from hass_nabucasa.llm import (
LLMAuthenticationError,
LLMRateLimitError,
LLMResponseCompletedEvent,
LLMResponseError,
LLMResponseErrorEvent,
LLMResponseFailedEvent,
LLMResponseFunctionCallArgumentsDeltaEvent,
LLMResponseFunctionCallArgumentsDoneEvent,
LLMResponseFunctionCallOutputItem,
LLMResponseImageOutputItem,
LLMResponseIncompleteEvent,
LLMResponseMessageOutputItem,
LLMResponseOutputItemAddedEvent,
LLMResponseOutputItemDoneEvent,
LLMResponseOutputTextDeltaEvent,
LLMResponseReasoningOutputItem,
LLMResponseReasoningSummaryTextDeltaEvent,
LLMResponseWebSearchCallOutputItem,
LLMResponseWebSearchCallSearchingEvent,
LLMServiceError,
)
from openai.types.responses import (
FunctionToolParam,
ResponseInputItemParam,
ResponseReasoningItem,
ToolParam,
WebSearchToolParam,
)
from openai.types.responses.response_input_param import (
ImageGenerationCall as ImageGenerationCallParam,
)
from openai.types.responses.response_output_item import ImageGenerationCall
import voluptuous as vol
from voluptuous_openapi import convert

from homeassistant.components import conversation
from homeassistant.config_entries import ConfigEntry
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers import llm
from homeassistant.helpers.entity import Entity
from homeassistant.util import slugify

from .client import CloudClient

_LOGGER = logging.getLogger(__name__)

_MAX_TOOL_ITERATIONS = 10


class ResponseItemType(str, Enum):
    """Response item types."""

    FUNCTION_CALL = "function_call"
MESSAGE = "message"
REASONING = "reasoning"
WEB_SEARCH_CALL = "web_search_call"
IMAGE = "image"
def _convert_content_to_param(
chat_content: Iterable[conversation.Content],
) -> list[ResponseInputItemParam]:
"""Convert any native chat message for this agent to the native format."""
messages: list[ResponseInputItemParam] = []
reasoning_summary: list[str] = []
web_search_calls: dict[str, dict[str, Any]] = {}
for content in chat_content:
if isinstance(content, conversation.ToolResultContent):
if (
content.tool_name == "web_search_call"
and content.tool_call_id in web_search_calls
):
web_search_call = web_search_calls.pop(content.tool_call_id)
web_search_call["status"] = content.tool_result.get(
"status", "completed"
)
messages.append(cast("ResponseInputItemParam", web_search_call))
else:
messages.append(
{
"type": "function_call_output",
"call_id": content.tool_call_id,
"output": json.dumps(content.tool_result),
}
)
continue
if content.content:
role: Literal["user", "assistant", "system", "developer"] = content.role
if role == "system":
role = "developer"
messages.append(
{"type": "message", "role": role, "content": content.content}
)
if isinstance(content, conversation.AssistantContent):
if content.tool_calls:
for tool_call in content.tool_calls:
if (
tool_call.external
and tool_call.tool_name == "web_search_call"
and "action" in tool_call.tool_args
):
web_search_calls[tool_call.id] = {
"type": "web_search_call",
"id": tool_call.id,
"action": tool_call.tool_args["action"],
"status": "completed",
}
else:
messages.append(
{
"type": "function_call",
"name": tool_call.tool_name,
"arguments": json.dumps(tool_call.tool_args),
"call_id": tool_call.id,
}
)
if content.thinking_content:
reasoning_summary.append(content.thinking_content)
if isinstance(content.native, ResponseReasoningItem):
messages.append(
{
"type": "reasoning",
"id": content.native.id,
"summary": (
[
{
"type": "summary_text",
"text": summary,
}
for summary in reasoning_summary
]
if content.thinking_content
else []
),
"encrypted_content": content.native.encrypted_content,
}
)
reasoning_summary = []
elif isinstance(content.native, ImageGenerationCall):
messages.append(
cast(ImageGenerationCallParam, content.native.to_dict())
)
    return messages


def _format_tool(
tool: llm.Tool,
custom_serializer: Callable[[Any], Any] | None,
) -> ToolParam:
"""Format a Home Assistant tool for the OpenAI Responses API."""
parameters = convert(tool.parameters, custom_serializer=custom_serializer)
spec: FunctionToolParam = {
"type": "function",
"name": tool.name,
"strict": False,
"description": tool.description,
"parameters": parameters,
}
    return spec


def _adjust_schema(schema: dict[str, Any]) -> None:
"""Adjust the schema to be compatible with OpenAI API."""
if schema["type"] == "object":
schema.setdefault("strict", True)
schema.setdefault("additionalProperties", False)
if "properties" not in schema:
return
if "required" not in schema:
schema["required"] = []
# Ensure all properties are required
for prop, prop_info in schema["properties"].items():
_adjust_schema(prop_info)
if prop not in schema["required"]:
prop_info["type"] = [prop_info["type"], "null"]
schema["required"].append(prop)
elif schema["type"] == "array":
if "items" not in schema:
return
_adjust_schema(schema["items"])
def _format_structured_output(
schema: vol.Schema, llm_api: llm.APIInstance | None
) -> dict[str, Any]:
"""Format the schema to be compatible with OpenAI API."""
result: dict[str, Any] = convert(
schema,
custom_serializer=(
llm_api.custom_serializer if llm_api else llm.selector_serializer
),
)
_ensure_schema_constraints(result)
    return result


def _ensure_schema_constraints(schema: dict[str, Any]) -> None:
"""Ensure generated schemas match the Responses API expectations."""
schema_type = schema.get("type")
if schema_type == "object":
schema.setdefault("additionalProperties", False)
properties = schema.get("properties")
if isinstance(properties, dict):
for property_schema in properties.values():
if isinstance(property_schema, dict):
_ensure_schema_constraints(property_schema)
elif schema_type == "array":
items = schema.get("items")
if isinstance(items, dict):
            _ensure_schema_constraints(items)


# Borrowed and adapted from the openai_conversation component
async def _transform_stream( # noqa: C901 - This is complex, but better to have it in one place
chat_log: conversation.ChatLog,
stream: Any,
remove_citations: bool = False,
) -> AsyncGenerator[
conversation.AssistantContentDeltaDict | conversation.ToolResultContentDeltaDict
]:
"""Transform stream result into HA format."""
last_summary_index = None
last_role: Literal["assistant", "tool_result"] | None = None
current_tool_call: LLMResponseFunctionCallOutputItem | None = None
    # Non-reasoning models don't follow our request to remove citations, so we
    # remove them manually here. Citations always follow the same pattern: they
    # appear in Markdown format inside parentheses and arrive in a single delta
    # event, although the closing parenthesis is sometimes split into a separate
    # delta event.
remove_parentheses: bool = False
citation_regexp = re.compile(r"\(\[([^\]]+)\]\((https?:\/\/[^\)]+)\)")
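    # Illustrative delta (hypothetical URL): "Sunny today ([weather.com](https://weather.com/nyc))"
    # The regexp matches "([weather.com](https://weather.com/nyc)"; the code below also
    # strips the leading space and the trailing ")", which may arrive in the next delta.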
async for event in stream:
_LOGGER.debug("Event[%s]", getattr(event, "type", None))
if isinstance(event, LLMResponseOutputItemAddedEvent):
if isinstance(event.item, LLMResponseFunctionCallOutputItem):
# OpenAI has tool calls as individual events
# while HA puts tool calls inside the assistant message.
# We turn them into individual assistant content for HA
# to ensure that tools are called as soon as possible.
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = None
current_tool_call = event.item
elif (
isinstance(event.item, LLMResponseMessageOutputItem)
or (
isinstance(event.item, LLMResponseReasoningOutputItem)
and last_summary_index is not None
) # Subsequent ResponseReasoningItem
or last_role != "assistant"
):
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = None
elif isinstance(event, LLMResponseOutputItemDoneEvent):
if isinstance(event.item, LLMResponseReasoningOutputItem):
encrypted_content = event.item.encrypted_content
summary = event.item.summary
yield {
"native": LLMResponseReasoningOutputItem(
type=event.item.type,
id=event.item.id,
summary=[],
encrypted_content=encrypted_content,
)
}
last_summary_index = len(summary) - 1 if summary else None
elif isinstance(event.item, LLMResponseWebSearchCallOutputItem):
action_dict = event.item.action
yield {
"tool_calls": [
llm.ToolInput(
id=event.item.id,
tool_name="web_search_call",
tool_args={"action": action_dict},
external=True,
)
]
}
yield {
"role": "tool_result",
"tool_call_id": event.item.id,
"tool_name": "web_search_call",
"tool_result": {"status": event.item.status},
}
last_role = "tool_result"
elif isinstance(event.item, LLMResponseImageOutputItem):
yield {"native": event.item.raw}
last_summary_index = -1 # Trigger new assistant message on next turn
elif isinstance(event, LLMResponseOutputTextDeltaEvent):
data = event.delta
if remove_parentheses:
data = data.removeprefix(")")
remove_parentheses = False
elif remove_citations and (match := citation_regexp.search(data)):
match_start, match_end = match.span()
# remove leading space if any
if data[match_start - 1 : match_start] == " ":
match_start -= 1
# remove closing parenthesis:
if data[match_end : match_end + 1] == ")":
match_end += 1
else:
remove_parentheses = True
data = data[:match_start] + data[match_end:]
if data:
yield {"content": data}
elif isinstance(event, LLMResponseReasoningSummaryTextDeltaEvent):
            # OpenAI can output several reasoning summaries in a single
            # ResponseReasoningItem. We split them into separate AssistantContent
            # messages; only the last one carries the reasoning `native` field.
if (
last_summary_index is not None
and event.summary_index != last_summary_index
):
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = event.summary_index
yield {"thinking_content": event.delta}
elif isinstance(event, LLMResponseFunctionCallArgumentsDeltaEvent):
if current_tool_call is not None:
current_tool_call.arguments += event.delta
elif isinstance(event, LLMResponseWebSearchCallSearchingEvent):
yield {"role": "assistant"}
elif isinstance(event, LLMResponseFunctionCallArgumentsDoneEvent):
if current_tool_call is not None:
current_tool_call.status = "completed"
raw_args = json.loads(current_tool_call.arguments)
for key in ("area", "floor"):
if key in raw_args and not raw_args[key]:
# Remove keys that are "" or None
raw_args.pop(key, None)
yield {
"tool_calls": [
llm.ToolInput(
id=current_tool_call.call_id,
tool_name=current_tool_call.name,
tool_args=raw_args,
)
]
}
elif isinstance(event, LLMResponseCompletedEvent):
response = event.response
if response and "usage" in response:
usage = response["usage"]
chat_log.async_trace(
{
"stats": {
"input_tokens": usage.get("input_tokens"),
"output_tokens": usage.get("output_tokens"),
}
}
)
elif isinstance(event, LLMResponseIncompleteEvent):
response = event.response
if response and "usage" in response:
usage = response["usage"]
chat_log.async_trace(
{
"stats": {
"input_tokens": usage.get("input_tokens"),
"output_tokens": usage.get("output_tokens"),
}
}
)
            incomplete_details = response.get("incomplete_details") if response else None
reason = "unknown reason"
if incomplete_details is not None and incomplete_details.get("reason"):
reason = incomplete_details["reason"]
if reason == "max_output_tokens":
reason = "max output tokens reached"
elif reason == "content_filter":
reason = "content filter triggered"
raise HomeAssistantError(f"OpenAI response incomplete: {reason}")
elif isinstance(event, LLMResponseFailedEvent):
response = event.response
if response and "usage" in response:
usage = response["usage"]
chat_log.async_trace(
{
"stats": {
"input_tokens": usage.get("input_tokens"),
"output_tokens": usage.get("output_tokens"),
}
}
)
reason = "unknown reason"
            if response and isinstance(error := response.get("error"), dict):
reason = error.get("message") or reason
raise HomeAssistantError(f"OpenAI response failed: {reason}")
elif isinstance(event, LLMResponseErrorEvent):
raise HomeAssistantError(f"OpenAI response error: {event.message}")
class BaseCloudLLMEntity(Entity):
"""Cloud LLM conversation agent."""
def __init__(self, cloud: Cloud[CloudClient], config_entry: ConfigEntry) -> None:
"""Initialize the entity."""
self._cloud = cloud
        self._entry = config_entry

    async def _prepare_chat_for_generation(
self,
chat_log: conversation.ChatLog,
messages: list[ResponseInputItemParam],
response_format: dict[str, Any] | None = None,
) -> dict[str, Any]:
"""Prepare kwargs for Cloud LLM from the chat log."""
        last_content: Any = chat_log.content[-1]
        if last_content.role == "user" and last_content.attachments:
            files = await self._async_prepare_files_for_prompt(last_content.attachments)
            # Attach the files to the last user message so that text and
            # attachments reach the model as a single multimodal message.
            last_message: Any = messages[-1]
            current_content = last_message.get("content")
            if isinstance(current_content, str):
                current_content = [{"type": "input_text", "text": current_content}]
            last_message["content"] = [*(current_content or []), *files]
tools: list[ToolParam] = []
tool_choice: str | None = None
if chat_log.llm_api:
ha_tools: list[ToolParam] = [
_format_tool(tool, chat_log.llm_api.custom_serializer)
for tool in chat_log.llm_api.tools
]
if ha_tools:
if not chat_log.unresponded_tool_results:
tools = ha_tools
tool_choice = "auto"
else:
tools = []
tool_choice = "none"
web_search = WebSearchToolParam(
type="web_search",
search_context_size="medium",
)
tools.append(web_search)
response_kwargs: dict[str, Any] = {
"messages": messages,
"conversation_id": chat_log.conversation_id,
}
if response_format is not None:
response_kwargs["response_format"] = response_format
if tools is not None:
response_kwargs["tools"] = tools
if tool_choice is not None:
response_kwargs["tool_choice"] = tool_choice
response_kwargs["stream"] = True
        return response_kwargs

    async def _async_prepare_files_for_prompt(
self,
attachments: list[conversation.Attachment],
) -> list[dict[str, Any]]:
"""Prepare files for multimodal prompts."""
def prepare() -> list[dict[str, Any]]:
content: list[dict[str, Any]] = []
for attachment in attachments:
mime_type = attachment.mime_type
path = attachment.path
if not path.exists():
raise HomeAssistantError(f"`{path}` does not exist")
data = base64.b64encode(path.read_bytes()).decode("utf-8")
if mime_type and mime_type.startswith("image/"):
content.append(
{
"type": "input_image",
"image_url": f"data:{mime_type};base64,{data}",
"detail": "auto",
}
)
elif mime_type and mime_type.startswith("application/pdf"):
content.append(
{
"type": "input_file",
"filename": str(path.name),
"file_data": f"data:{mime_type};base64,{data}",
}
)
else:
raise HomeAssistantError(
"Only images and PDF are currently supported as attachments"
)
return content
        return await self.hass.async_add_executor_job(prepare)

    async def _async_handle_chat_log(
self,
type: Literal["ai_task", "conversation"],
chat_log: conversation.ChatLog,
structure_name: str | None = None,
structure: vol.Schema | None = None,
) -> None:
"""Generate a response for the chat log."""
for _ in range(_MAX_TOOL_ITERATIONS):
response_format: dict[str, Any] | None = None
if structure and structure_name:
response_format = {
"type": "json_schema",
"json_schema": {
"name": slugify(structure_name),
"schema": _format_structured_output(
structure, chat_log.llm_api
),
"strict": False,
},
}
messages = _convert_content_to_param(chat_log.content)
response_kwargs = await self._prepare_chat_for_generation(
chat_log,
messages,
response_format,
)
try:
if type == "conversation":
raw_stream = await self._cloud.llm.async_process_conversation(
**response_kwargs,
)
else:
raw_stream = await self._cloud.llm.async_generate_data(
**response_kwargs,
)
messages.extend(
_convert_content_to_param(
[
content
async for content in chat_log.async_add_delta_content_stream(
self.entity_id,
_transform_stream(
chat_log,
raw_stream,
True,
),
)
]
)
)
except LLMAuthenticationError as err:
raise HomeAssistantError("Cloud LLM authentication failed") from err
except LLMRateLimitError as err:
raise HomeAssistantError("Cloud LLM is rate limited") from err
except LLMResponseError as err:
raise HomeAssistantError(str(err)) from err
except LLMServiceError as err:
raise HomeAssistantError("Error talking to Cloud LLM") from err
except NabuCasaBaseError as err:
raise HomeAssistantError(str(err)) from err
if not chat_log.unresponded_tool_results:
break