Add Google Cloud Speech-to-Text (STT) (#120854)

* Google Cloud

* .

* fix

* mypy

* add tests

* Update .coveragerc

* Update const.py

* upload file, reconfigure and import flow

* fixes

* default to latest_short

* mypy

* update

* Allow clearing options in the UI

* update

* update

* update
This commit is contained in:
tronikos
2024-09-03 06:23:07 -07:00
committed by GitHub
parent eda1656e75
commit 334359bb0a
9 changed files with 345 additions and 4 deletions

View File

@ -6,7 +6,7 @@ from homeassistant.config_entries import ConfigEntry
from homeassistant.const import Platform
from homeassistant.core import HomeAssistant
PLATFORMS = [Platform.TTS]
PLATFORMS = [Platform.STT, Platform.TTS]
async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:

View File

@ -26,7 +26,16 @@ from homeassistant.helpers.selector import (
SelectSelectorMode,
)
from .const import CONF_KEY_FILE, CONF_SERVICE_ACCOUNT_INFO, DEFAULT_LANG, DOMAIN, TITLE
from .const import (
CONF_KEY_FILE,
CONF_SERVICE_ACCOUNT_INFO,
CONF_STT_MODEL,
DEFAULT_LANG,
DEFAULT_STT_MODEL,
DOMAIN,
SUPPORTED_STT_MODELS,
TITLE,
)
from .helpers import (
async_tts_voices,
tts_options_schema,
@ -162,6 +171,15 @@ class GoogleCloudOptionsFlowHandler(OptionsFlowWithConfigEntry):
**tts_options_schema(
self.options, voices, from_config_flow=True
).schema,
vol.Optional(
CONF_STT_MODEL,
default=DEFAULT_STT_MODEL,
): SelectSelector(
SelectSelectorConfig(
mode=SelectSelectorMode.DROPDOWN,
options=SUPPORTED_STT_MODELS,
)
),
}
),
self.options,

View File

@ -10,6 +10,7 @@ CONF_KEY_FILE = "key_file"
DEFAULT_LANG = "en-US"
# TTS constants
CONF_GENDER = "gender"
CONF_VOICE = "voice"
CONF_ENCODING = "encoding"
@ -18,3 +19,166 @@ CONF_PITCH = "pitch"
CONF_GAIN = "gain"
CONF_PROFILES = "profiles"
CONF_TEXT_TYPE = "text_type"
# STT constants
# Key under which the selected speech-to-text model is stored in the
# config entry options.
CONF_STT_MODEL = "stt_model"
# Model used when the options do not contain CONF_STT_MODEL.
DEFAULT_STT_MODEL = "latest_short"
# Model identifiers offered as choices for CONF_STT_MODEL.
# https://cloud.google.com/speech-to-text/docs/transcription-model
SUPPORTED_STT_MODELS = [
"latest_long",
"latest_short",
"telephony",
"telephony_short",
"medical_dictation",
"medical_conversation",
"command_and_search",
"default",
"phone_call",
"video",
]
# Language codes that the STT entity reports as its supported languages.
# The list mirrors the languages documented at the URL below.
# https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
STT_LANGUAGES = [
"af-ZA",
"am-ET",
"ar-AE",
"ar-BH",
"ar-DZ",
"ar-EG",
"ar-IL",
"ar-IQ",
"ar-JO",
"ar-KW",
"ar-LB",
"ar-MA",
"ar-MR",
"ar-OM",
"ar-PS",
"ar-QA",
"ar-SA",
"ar-SY",
"ar-TN",
"ar-YE",
"az-AZ",
"bg-BG",
"bn-BD",
"bn-IN",
"bs-BA",
"ca-ES",
"cmn-Hans-CN",
"cmn-Hans-HK",
"cmn-Hant-TW",
"cs-CZ",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-CA",
"en-GB",
"en-GH",
"en-HK",
"en-IE",
"en-IN",
"en-KE",
"en-NG",
"en-NZ",
"en-PH",
"en-PK",
"en-SG",
"en-TZ",
"en-US",
"en-ZA",
"es-AR",
"es-BO",
"es-CL",
"es-CO",
"es-CR",
"es-DO",
"es-EC",
"es-ES",
"es-GT",
"es-HN",
"es-MX",
"es-NI",
"es-PA",
"es-PE",
"es-PR",
"es-PY",
"es-SV",
"es-US",
"es-UY",
"es-VE",
"et-EE",
"eu-ES",
"fa-IR",
"fi-FI",
"fil-PH",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"gl-ES",
"gu-IN",
"hi-IN",
"hr-HR",
"hu-HU",
"hy-AM",
"id-ID",
"is-IS",
"it-CH",
"it-IT",
"iw-IL",
"ja-JP",
"jv-ID",
"ka-GE",
"kk-KZ",
"km-KH",
"kn-IN",
"ko-KR",
"lo-LA",
"lt-LT",
"lv-LV",
"mk-MK",
"ml-IN",
"mn-MN",
"mr-IN",
"ms-MY",
"my-MM",
"ne-NP",
"nl-BE",
"nl-NL",
"no-NO",
"pa-Guru-IN",
"pl-PL",
"pt-BR",
"pt-PT",
"ro-RO",
"ru-RU",
"si-LK",
"sk-SK",
"sl-SI",
"sq-AL",
"sr-RS",
"su-ID",
"sv-SE",
"sw-KE",
"sw-TZ",
"ta-IN",
"ta-LK",
"ta-MY",
"ta-SG",
"te-IN",
"th-TH",
"tr-TR",
"uk-UA",
"ur-IN",
"ur-PK",
"uz-UZ",
"vi-VN",
"yue-Hant-HK",
"zu-ZA",
]

View File

@ -7,5 +7,8 @@
"documentation": "https://www.home-assistant.io/integrations/google_cloud",
"integration_type": "service",
"iot_class": "cloud_push",
"requirements": ["google-cloud-texttospeech==2.17.2"]
"requirements": [
"google-cloud-texttospeech==2.17.2",
"google-cloud-speech==2.27.0"
]
}

View File

@ -24,7 +24,8 @@
"pitch": "Default pitch of the voice",
"gain": "Default volume gain (in dB) of the voice",
"profiles": "Default audio profiles",
"text_type": "Default text type"
"text_type": "Default text type",
"stt_model": "STT model"
}
}
}

View File

@ -0,0 +1,147 @@
"""Support for the Google Cloud STT service."""
from __future__ import annotations
from collections.abc import AsyncGenerator, AsyncIterable
import logging
from google.api_core.exceptions import GoogleAPIError, Unauthenticated
from google.cloud import speech_v1
from homeassistant.components.stt import (
AudioBitRates,
AudioChannels,
AudioCodecs,
AudioFormats,
AudioSampleRates,
SpeechMetadata,
SpeechResult,
SpeechResultState,
SpeechToTextEntity,
)
from homeassistant.config_entries import ConfigEntry
from homeassistant.core import HomeAssistant
from homeassistant.helpers import device_registry as dr
from homeassistant.helpers.entity_platform import AddEntitiesCallback
from .const import (
CONF_SERVICE_ACCOUNT_INFO,
CONF_STT_MODEL,
DEFAULT_STT_MODEL,
DOMAIN,
STT_LANGUAGES,
)
_LOGGER = logging.getLogger(__name__)
async def async_setup_entry(
    hass: HomeAssistant,
    config_entry: ConfigEntry,
    async_add_entities: AddEntitiesCallback,
) -> None:
    """Create the Google Cloud STT entity for a config entry.

    Builds an async Speech client from the service account info stored in
    the entry's data and registers a single speech-to-text entity that
    uses that client.
    """
    stt_client = speech_v1.SpeechAsyncClient.from_service_account_info(
        config_entry.data[CONF_SERVICE_ACCOUNT_INFO]
    )
    entity = GoogleCloudSpeechToTextEntity(config_entry, stt_client)
    async_add_entities([entity])
class GoogleCloudSpeechToTextEntity(SpeechToTextEntity):
    """Google Cloud STT entity.

    Streams audio to the Google Cloud Speech API through the supplied async
    client, using the model stored in the config entry options, and returns
    the concatenated transcript.
    """

    def __init__(
        self,
        entry: ConfigEntry,
        client: speech_v1.SpeechAsyncClient,
    ) -> None:
        """Init Google Cloud STT entity.

        The unique id, name and device info are derived from the config
        entry; the STT model is read from the entry options and falls back
        to DEFAULT_STT_MODEL when the option is absent.
        """
        self._attr_unique_id = f"{entry.entry_id}-stt"
        self._attr_name = entry.title
        self._attr_device_info = dr.DeviceInfo(
            identifiers={(DOMAIN, entry.entry_id)},
            manufacturer="Google",
            model="Cloud",
            entry_type=dr.DeviceEntryType.SERVICE,
        )
        self._entry = entry
        self._client = client
        self._model = entry.options.get(CONF_STT_MODEL, DEFAULT_STT_MODEL)

    @property
    def supported_languages(self) -> list[str]:
        """Return a list of supported languages."""
        return STT_LANGUAGES

    @property
    def supported_formats(self) -> list[AudioFormats]:
        """Return a list of supported formats."""
        return [AudioFormats.WAV, AudioFormats.OGG]

    @property
    def supported_codecs(self) -> list[AudioCodecs]:
        """Return a list of supported codecs."""
        return [AudioCodecs.PCM, AudioCodecs.OPUS]

    @property
    def supported_bit_rates(self) -> list[AudioBitRates]:
        """Return a list of supported bitrates."""
        return [AudioBitRates.BITRATE_16]

    @property
    def supported_sample_rates(self) -> list[AudioSampleRates]:
        """Return a list of supported samplerates."""
        return [AudioSampleRates.SAMPLERATE_16000]

    @property
    def supported_channels(self) -> list[AudioChannels]:
        """Return a list of supported channels."""
        return [AudioChannels.CHANNEL_MONO]

    async def async_process_audio_stream(
        self, metadata: SpeechMetadata, stream: AsyncIterable[bytes]
    ) -> SpeechResult:
        """Process an audio stream to STT service.

        Sends the recognition config followed by every audio chunk from
        ``stream`` and joins the top alternative of the first result of each
        response into one transcript.

        Returns a SUCCESS result with the transcript, or an ERROR result with
        no text when the API call raises GoogleAPIError. An Unauthenticated
        error additionally starts a reauth flow for the config entry.
        """
        # OPUS audio is sent as OGG_OPUS; every other codec is sent as LINEAR16.
        streaming_config = speech_v1.StreamingRecognitionConfig(
            config=speech_v1.RecognitionConfig(
                encoding=(
                    speech_v1.RecognitionConfig.AudioEncoding.OGG_OPUS
                    if metadata.codec == AudioCodecs.OPUS
                    else speech_v1.RecognitionConfig.AudioEncoding.LINEAR16
                ),
                sample_rate_hertz=metadata.sample_rate,
                language_code=metadata.language,
                model=self._model,
            )
        )

        async def request_generator() -> (
            AsyncGenerator[speech_v1.StreamingRecognizeRequest]
        ):
            # The first request must only contain a streaming_config
            yield speech_v1.StreamingRecognizeRequest(streaming_config=streaming_config)
            # All subsequent requests must only contain audio_content
            async for audio_content in stream:
                yield speech_v1.StreamingRecognizeRequest(audio_content=audio_content)

        try:
            responses = await self._client.streaming_recognize(
                requests=request_generator(),
                timeout=10,
            )
            # Collect the pieces and join once instead of growing a string
            # with += on every response.
            segments: list[str] = []
            async for response in responses:
                _LOGGER.debug("response: %s", response)
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue
                # Reuse the already-bound first result rather than indexing
                # response.results a second time.
                segments.append(result.alternatives[0].transcript)
        except GoogleAPIError as err:
            _LOGGER.error("Error occurred during Google Cloud STT call: %s", err)
            if isinstance(err, Unauthenticated):
                # Credentials were rejected: ask the user to re-authenticate.
                self._entry.async_start_reauth(self.hass)
            return SpeechResult(None, SpeechResultState.ERROR)

        return SpeechResult("".join(segments), SpeechResultState.SUCCESS)

View File

@ -985,6 +985,9 @@ google-api-python-client==2.71.0
# homeassistant.components.google_pubsub
google-cloud-pubsub==2.23.0
# homeassistant.components.google_cloud
google-cloud-speech==2.27.0
# homeassistant.components.google_cloud
google-cloud-texttospeech==2.17.2

View File

@ -835,6 +835,9 @@ google-api-python-client==2.71.0
# homeassistant.components.google_pubsub
google-cloud-pubsub==2.23.0
# homeassistant.components.google_cloud
google-cloud-speech==2.27.0
# homeassistant.components.google_cloud
google-cloud-texttospeech==2.17.2

View File

@ -161,6 +161,7 @@ async def test_options_flow(
"gain",
"profiles",
"text_type",
"stt_model",
}
assert mock_api_tts_from_service_account_info.list_voices.call_count == 2
@ -179,5 +180,6 @@ async def test_options_flow(
"gain": 0.0,
"profiles": [],
"text_type": "text",
"stt_model": "latest_short",
}
assert mock_api_tts_from_service_account_info.list_voices.call_count == 3