Auto recover when the Bluetooth adapter stops responding (#77043)

This commit is contained in:
J. Nick Koston
2022-08-20 11:58:14 -10:00
committed by GitHub
parent eb0828efdb
commit ced8278e32
10 changed files with 406 additions and 72 deletions

View File

@ -39,7 +39,7 @@ from .models import (
HaBleakScannerWrapper,
ProcessAdvertisementCallback,
)
from .scanner import HaScanner, create_bleak_scanner
from .scanner import HaScanner, ScannerStartError, create_bleak_scanner
from .util import adapter_human_name, adapter_unique_name, async_default_adapter
if TYPE_CHECKING:
@ -281,7 +281,10 @@ async def async_setup_entry(
) from err
scanner = HaScanner(hass, bleak_scanner, adapter, address)
entry.async_on_unload(scanner.async_register_callback(manager.scanner_adv_received))
await scanner.async_start()
try:
await scanner.async_start()
except ScannerStartError as err:
raise ConfigEntryNotReady from err
entry.async_on_unload(manager.async_register_scanner(scanner))
await async_update_device(entry, manager, adapter, address)
hass.data.setdefault(DOMAIN, {})[entry.entry_id] = scanner

View File

@ -13,13 +13,12 @@ WINDOWS_DEFAULT_BLUETOOTH_ADAPTER = "bluetooth"
MACOS_DEFAULT_BLUETOOTH_ADAPTER = "Core Bluetooth"
UNIX_DEFAULT_BLUETOOTH_ADAPTER = "hci0"
DEFAULT_ADAPTERS = {MACOS_DEFAULT_BLUETOOTH_ADAPTER, UNIX_DEFAULT_BLUETOOTH_ADAPTER}
DEFAULT_ADAPTER_BY_PLATFORM = {
"Windows": WINDOWS_DEFAULT_BLUETOOTH_ADAPTER,
"Darwin": MACOS_DEFAULT_BLUETOOTH_ADAPTER,
}
# Some operating systems hide the adapter address for privacy reasons (ex MacOS)
DEFAULT_ADDRESS: Final = "00:00:00:00:00:00"
@ -28,9 +27,29 @@ SOURCE_LOCAL: Final = "local"
DATA_MANAGER: Final = "bluetooth_manager"
UNAVAILABLE_TRACK_SECONDS: Final = 60 * 5
START_TIMEOUT = 12
SCANNER_WATCHDOG_TIMEOUT: Final = 60 * 5
SCANNER_WATCHDOG_INTERVAL: Final = timedelta(seconds=SCANNER_WATCHDOG_TIMEOUT)
START_TIMEOUT = 15
MAX_DBUS_SETUP_SECONDS = 5
# Anything after 30s is considered stale, we have buffer
# for start timeouts and execution time
STALE_ADVERTISEMENT_SECONDS: Final = 30 + START_TIMEOUT + MAX_DBUS_SETUP_SECONDS
# We must recover before we hit the 180s mark
# where the device is removed from the stack
# or the devices will go unavailable. Since
# we only check every 30s, we need this number
# to be
# 180s Time when device is removed from stack
# - 30s check interval
# - 20s scanner restart time * 2
#
SCANNER_WATCHDOG_TIMEOUT: Final = 110
# How often to check if the scanner has reached
# the SCANNER_WATCHDOG_TIMEOUT without seeing anything
SCANNER_WATCHDOG_INTERVAL: Final = timedelta(seconds=30)
class AdapterDetails(TypedDict, total=False):

View File

@ -23,6 +23,7 @@ from homeassistant.helpers.event import async_track_time_interval
from .const import (
ADAPTER_ADDRESS,
SOURCE_LOCAL,
STALE_ADVERTISEMENT_SECONDS,
UNAVAILABLE_TRACK_SECONDS,
AdapterDetails,
)
@ -46,7 +47,6 @@ FILTER_UUIDS: Final = "UUIDs"
RSSI_SWITCH_THRESHOLD = 6
STALE_ADVERTISEMENT_SECONDS = 180
_LOGGER = logging.getLogger(__name__)

View File

@ -4,7 +4,11 @@
"documentation": "https://www.home-assistant.io/integrations/bluetooth",
"dependencies": ["websocket_api"],
"quality_scale": "internal",
"requirements": ["bleak==0.15.1", "bluetooth-adapters==0.2.0"],
"requirements": [
"bleak==0.15.1",
"bluetooth-adapters==0.2.0",
"bluetooth-auto-recovery==0.2.1"
],
"codeowners": ["@bdraco"],
"config_flow": true,
"iot_class": "local_push"

View File

@ -5,6 +5,7 @@ import asyncio
from collections.abc import Callable
from datetime import datetime
import logging
import platform
import time
import async_timeout
@ -21,19 +22,18 @@ from homeassistant.core import (
HomeAssistant,
callback as hass_callback,
)
from homeassistant.exceptions import ConfigEntryNotReady
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers.event import async_track_time_interval
from homeassistant.util.package import is_docker_env
from .const import (
DEFAULT_ADAPTERS,
SCANNER_WATCHDOG_INTERVAL,
SCANNER_WATCHDOG_TIMEOUT,
SOURCE_LOCAL,
START_TIMEOUT,
)
from .models import BluetoothScanningMode
from .util import adapter_human_name
from .util import adapter_human_name, async_reset_adapter
OriginalBleakScanner = bleak.BleakScanner
MONOTONIC_TIME = time.monotonic
@ -44,6 +44,12 @@ _LOGGER = logging.getLogger(__name__)
MONOTONIC_TIME = time.monotonic
NEED_RESET_ERRORS = [
"org.bluez.Error.Failed",
"org.bluez.Error.InProgress",
"org.bluez.Error.NotReady",
]
START_ATTEMPTS = 2
SCANNING_MODE_TO_BLEAK = {
BluetoothScanningMode.ACTIVE: "active",
@ -51,12 +57,17 @@ SCANNING_MODE_TO_BLEAK = {
}
class ScannerStartError(HomeAssistantError):
"""Error to indicate that the scanner failed to start."""
def create_bleak_scanner(
scanning_mode: BluetoothScanningMode, adapter: str | None
) -> bleak.BleakScanner:
"""Create a Bleak scanner."""
scanner_kwargs = {"scanning_mode": SCANNING_MODE_TO_BLEAK[scanning_mode]}
if adapter and adapter not in DEFAULT_ADAPTERS:
# Only Linux supports multiple adapters
if adapter and platform.system() == "Linux":
scanner_kwargs["adapter"] = adapter
_LOGGER.debug("Initializing bluetooth scanner with %s", scanner_kwargs)
try:
@ -66,7 +77,7 @@ def create_bleak_scanner(
class HaScanner:
"""Operate a BleakScanner.
"""Operate and automatically recover a BleakScanner.
Multiple BleakScanner can be used at the same time
if there are multiple adapters. This is only useful
@ -91,6 +102,7 @@ class HaScanner:
self._cancel_stop: CALLBACK_TYPE | None = None
self._cancel_watchdog: CALLBACK_TYPE | None = None
self._last_detection = 0.0
self._start_time = 0.0
self._callbacks: list[
Callable[[BLEDevice, AdvertisementData, float, str], None]
] = []
@ -129,9 +141,19 @@ class HaScanner:
Currently this is used to feed the callbacks into the
central manager.
"""
self._last_detection = MONOTONIC_TIME()
callback_time = MONOTONIC_TIME()
if (
advertisement_data.local_name
or advertisement_data.manufacturer_data
or advertisement_data.service_data
or advertisement_data.service_uuids
):
# Don't count empty advertisements
# as the adapter is in a failure
# state if all the data is empty.
self._last_detection = callback_time
for callback in self._callbacks:
callback(ble_device, advertisement_data, self._last_detection, self.source)
callback(ble_device, advertisement_data, callback_time, self.source)
async def async_start(self) -> None:
"""Start bluetooth scanner."""
@ -142,55 +164,85 @@ class HaScanner:
async def _async_start(self) -> None:
"""Start bluetooth scanner under the lock."""
try:
async with async_timeout.timeout(START_TIMEOUT):
await self.scanner.start() # type: ignore[no-untyped-call]
except InvalidMessageError as ex:
for attempt in range(START_ATTEMPTS):
_LOGGER.debug(
"%s: Invalid DBus message received: %s", self.name, ex, exc_info=True
)
raise ConfigEntryNotReady(
f"{self.name}: Invalid DBus message received: {ex}; try restarting `dbus`"
) from ex
except BrokenPipeError as ex:
_LOGGER.debug(
"%s: DBus connection broken: %s", self.name, ex, exc_info=True
)
if is_docker_env():
raise ConfigEntryNotReady(
f"{self.name}: DBus connection broken: {ex}; try restarting `bluetooth`, `dbus`, and finally the docker container"
) from ex
raise ConfigEntryNotReady(
f"{self.name}: DBus connection broken: {ex}; try restarting `bluetooth` and `dbus`"
) from ex
except FileNotFoundError as ex:
_LOGGER.debug(
"%s: FileNotFoundError while starting bluetooth: %s",
"%s: Starting bluetooth discovery attempt: (%s/%s)",
self.name,
ex,
exc_info=True,
attempt + 1,
START_ATTEMPTS,
)
if is_docker_env():
raise ConfigEntryNotReady(
f"{self.name}: DBus service not found; docker config may be missing `-v /run/dbus:/run/dbus:ro`: {ex}"
try:
async with async_timeout.timeout(START_TIMEOUT):
await self.scanner.start() # type: ignore[no-untyped-call]
except InvalidMessageError as ex:
_LOGGER.debug(
"%s: Invalid DBus message received: %s",
self.name,
ex,
exc_info=True,
)
raise ScannerStartError(
f"{self.name}: Invalid DBus message received: {ex}; "
"try restarting `dbus`"
) from ex
raise ConfigEntryNotReady(
f"{self.name}: DBus service not found; make sure the DBus socket is available to Home Assistant: {ex}"
) from ex
except asyncio.TimeoutError as ex:
raise ConfigEntryNotReady(
f"{self.name}: Timed out starting Bluetooth after {START_TIMEOUT} seconds"
) from ex
except BleakError as ex:
_LOGGER.debug(
"%s: BleakError while starting bluetooth: %s",
self.name,
ex,
exc_info=True,
)
raise ConfigEntryNotReady(
f"{self.name}: Failed to start Bluetooth: {ex}"
) from ex
except BrokenPipeError as ex:
_LOGGER.debug(
"%s: DBus connection broken: %s", self.name, ex, exc_info=True
)
if is_docker_env():
raise ScannerStartError(
f"{self.name}: DBus connection broken: {ex}; try restarting "
"`bluetooth`, `dbus`, and finally the docker container"
) from ex
raise ScannerStartError(
f"{self.name}: DBus connection broken: {ex}; try restarting "
"`bluetooth` and `dbus`"
) from ex
except FileNotFoundError as ex:
_LOGGER.debug(
"%s: FileNotFoundError while starting bluetooth: %s",
self.name,
ex,
exc_info=True,
)
if is_docker_env():
raise ScannerStartError(
f"{self.name}: DBus service not found; docker config may "
"be missing `-v /run/dbus:/run/dbus:ro`: {ex}"
) from ex
raise ScannerStartError(
f"{self.name}: DBus service not found; make sure the DBus socket "
f"is available to Home Assistant: {ex}"
) from ex
except asyncio.TimeoutError as ex:
if attempt == 0:
await self._async_reset_adapter()
continue
raise ScannerStartError(
f"{self.name}: Timed out starting Bluetooth after {START_TIMEOUT} seconds"
) from ex
except BleakError as ex:
if attempt == 0:
error_str = str(ex)
if any(
needs_reset_error in error_str
for needs_reset_error in NEED_RESET_ERRORS
):
await self._async_reset_adapter()
continue
_LOGGER.debug(
"%s: BleakError while starting bluetooth: %s",
self.name,
ex,
exc_info=True,
)
raise ScannerStartError(
f"{self.name}: Failed to start Bluetooth: {ex}"
) from ex
# Everything is fine, break out of the loop
break
self._async_setup_scanner_watchdog()
self._cancel_stop = self.hass.bus.async_listen_once(
EVENT_HOMEASSISTANT_STOP, self._async_hass_stopping
@ -199,48 +251,78 @@ class HaScanner:
@hass_callback
def _async_setup_scanner_watchdog(self) -> None:
"""If Dbus gets restarted or updated, we need to restart the scanner."""
self._last_detection = MONOTONIC_TIME()
self._cancel_watchdog = async_track_time_interval(
self.hass, self._async_scanner_watchdog, SCANNER_WATCHDOG_INTERVAL
)
self._start_time = self._last_detection = MONOTONIC_TIME()
if not self._cancel_watchdog:
self._cancel_watchdog = async_track_time_interval(
self.hass, self._async_scanner_watchdog, SCANNER_WATCHDOG_INTERVAL
)
async def _async_scanner_watchdog(self, now: datetime) -> None:
"""Check if the scanner is running."""
time_since_last_detection = MONOTONIC_TIME() - self._last_detection
_LOGGER.debug(
"%s: Scanner watchdog time_since_last_detection: %s",
self.name,
time_since_last_detection,
)
if time_since_last_detection < SCANNER_WATCHDOG_TIMEOUT:
return
_LOGGER.info(
"%s: Bluetooth scanner has gone quiet for %s, restarting",
"%s: Bluetooth scanner has gone quiet for %ss, restarting",
self.name,
SCANNER_WATCHDOG_INTERVAL,
SCANNER_WATCHDOG_TIMEOUT,
)
async with self._start_stop_lock:
await self._async_stop()
await self._async_start()
# Stop the scanner but not the watchdog
# since we want to try again later if it's still quiet
await self._async_stop_scanner()
if self._start_time == self._last_detection or (
time_since_last_detection
) > (SCANNER_WATCHDOG_TIMEOUT + SCANNER_WATCHDOG_INTERVAL.total_seconds()):
await self._async_reset_adapter()
try:
await self._async_start()
except ScannerStartError as ex:
_LOGGER.error(
"%s: Failed to restart Bluetooth scanner: %s",
self.name,
ex,
exc_info=True,
)
async def _async_hass_stopping(self, event: Event) -> None:
"""Stop the Bluetooth integration at shutdown."""
self._cancel_stop = None
await self.async_stop()
async def _async_reset_adapter(self) -> None:
"""Reset the adapter."""
_LOGGER.warning("%s: adapter stopped responding; executing reset", self.name)
result = await async_reset_adapter(self.adapter)
_LOGGER.info("%s: adapter reset result: %s", self.name, result)
async def async_stop(self) -> None:
"""Stop bluetooth scanner."""
async with self._start_stop_lock:
await self._async_stop()
async def _async_stop(self) -> None:
"""Stop bluetooth discovery under the lock."""
_LOGGER.debug("Stopping bluetooth discovery")
"""Cancel watchdog and bluetooth discovery under the lock."""
if self._cancel_watchdog:
self._cancel_watchdog()
self._cancel_watchdog = None
await self._async_stop_scanner()
async def _async_stop_scanner(self) -> None:
"""Stop bluetooth discovery under the lock."""
if self._cancel_stop:
self._cancel_stop()
self._cancel_stop = None
_LOGGER.debug("%s: Stopping bluetooth discovery", self.name)
try:
await self.scanner.stop() # type: ignore[no-untyped-call]
except BleakError as ex:
# This is not fatal, and they may want to reload
# the config entry to restart the scanner if they
# change the bluetooth dongle.
_LOGGER.error("Error stopping scanner: %s", ex)
_LOGGER.error("%s: Error stopping scanner: %s", self.name, ex)

View File

@ -3,6 +3,8 @@ from __future__ import annotations
import platform
from bluetooth_auto_recovery import recover_adapter
from homeassistant.core import callback
from .const import (
@ -65,3 +67,11 @@ def adapter_human_name(adapter: str, address: str) -> str:
def adapter_unique_name(adapter: str, address: str) -> str:
"""Return a unique name for the adapter."""
return adapter if address == DEFAULT_ADDRESS else address
async def async_reset_adapter(adapter: str | None) -> bool | None:
"""Reset the adapter."""
if adapter and adapter.startswith("hci"):
adapter_id = int(adapter[3:])
return await recover_adapter(adapter_id)
return False

View File

@ -12,6 +12,7 @@ awesomeversion==22.6.0
bcrypt==3.1.7
bleak==0.15.1
bluetooth-adapters==0.2.0
bluetooth-auto-recovery==0.2.1
certifi>=2021.5.30
ciso8601==2.2.0
cryptography==37.0.4

View File

@ -426,6 +426,9 @@ blockchain==1.4.4
# homeassistant.components.bluetooth
bluetooth-adapters==0.2.0
# homeassistant.components.bluetooth
bluetooth-auto-recovery==0.2.1
# homeassistant.components.bond
bond-async==0.1.22

View File

@ -337,6 +337,9 @@ blinkpy==0.19.0
# homeassistant.components.bluetooth
bluetooth-adapters==0.2.0
# homeassistant.components.bluetooth
bluetooth-auto-recovery==0.2.1
# homeassistant.components.bond
bond-async==0.1.22

View File

@ -8,12 +8,14 @@ from bleak.backends.scanner import (
BLEDevice,
)
from dbus_next import InvalidMessageError
import pytest
from homeassistant.components import bluetooth
from homeassistant.components.bluetooth.const import (
SCANNER_WATCHDOG_INTERVAL,
SCANNER_WATCHDOG_TIMEOUT,
)
from homeassistant.components.bluetooth.scanner import NEED_RESET_ERRORS
from homeassistant.config_entries import ConfigEntryState
from homeassistant.const import EVENT_HOMEASSISTANT_STARTED, EVENT_HOMEASSISTANT_STOP
from homeassistant.util import dt as dt_util
@ -140,6 +142,27 @@ async def test_invalid_dbus_message(hass, caplog, one_adapter):
assert "dbus" in caplog.text
@pytest.mark.parametrize("error", NEED_RESET_ERRORS)
async def test_adapter_needs_reset_at_start(hass, caplog, one_adapter, error):
"""Test we cycle the adapter when it needs a restart."""
with patch(
"homeassistant.components.bluetooth.scanner.OriginalBleakScanner.start",
side_effect=[BleakError(error), None],
), patch(
"homeassistant.components.bluetooth.util.recover_adapter", return_value=True
) as mock_recover_adapter:
await async_setup_with_one_adapter(hass)
hass.bus.async_fire(EVENT_HOMEASSISTANT_STARTED)
await hass.async_block_till_done()
assert len(mock_recover_adapter.mock_calls) == 1
hass.bus.async_fire(EVENT_HOMEASSISTANT_STOP)
await hass.async_block_till_done()
async def test_recovery_from_dbus_restart(hass, one_adapter):
"""Test we can recover when DBus gets restarted out from under us."""
@ -223,3 +246,189 @@ async def test_recovery_from_dbus_restart(hass, one_adapter):
await hass.async_block_till_done()
assert called_start == 2
async def test_adapter_recovery(hass, one_adapter):
"""Test we can recover when the adapter stops responding."""
called_start = 0
called_stop = 0
_callback = None
mock_discovered = []
class MockBleakScanner:
async def start(self, *args, **kwargs):
"""Mock Start."""
nonlocal called_start
called_start += 1
async def stop(self, *args, **kwargs):
"""Mock Start."""
nonlocal called_stop
called_stop += 1
@property
def discovered_devices(self):
"""Mock discovered_devices."""
nonlocal mock_discovered
return mock_discovered
def register_detection_callback(self, callback: AdvertisementDataCallback):
"""Mock Register Detection Callback."""
nonlocal _callback
_callback = callback
scanner = MockBleakScanner()
start_time_monotonic = 1000
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic,
), patch(
"homeassistant.components.bluetooth.scanner.OriginalBleakScanner",
return_value=scanner,
):
await async_setup_with_one_adapter(hass)
assert called_start == 1
scanner = _get_manager()
mock_discovered = [MagicMock()]
# Ensure we don't restart the scanner if we don't need to
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic + 10,
):
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert called_start == 1
# Ensure we don't restart the scanner if we don't need to
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic + 20,
):
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert called_start == 1
# We hit the timer with no detections, so we reset the adapter and restart the scanner
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic
+ SCANNER_WATCHDOG_TIMEOUT
+ SCANNER_WATCHDOG_INTERVAL.total_seconds(),
), patch(
"homeassistant.components.bluetooth.util.recover_adapter", return_value=True
) as mock_recover_adapter:
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert len(mock_recover_adapter.mock_calls) == 1
assert called_start == 2
async def test_adapter_scanner_fails_to_start_first_time(hass, one_adapter):
"""Test we can recover when the adapter stops responding and the first recovery fails."""
called_start = 0
called_stop = 0
_callback = None
mock_discovered = []
class MockBleakScanner:
async def start(self, *args, **kwargs):
"""Mock Start."""
nonlocal called_start
called_start += 1
if called_start == 1:
return # Start ok the first time
if called_start < 4:
raise BleakError("Failed to start")
async def stop(self, *args, **kwargs):
"""Mock Start."""
nonlocal called_stop
called_stop += 1
@property
def discovered_devices(self):
"""Mock discovered_devices."""
nonlocal mock_discovered
return mock_discovered
def register_detection_callback(self, callback: AdvertisementDataCallback):
"""Mock Register Detection Callback."""
nonlocal _callback
_callback = callback
scanner = MockBleakScanner()
start_time_monotonic = 1000
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic,
), patch(
"homeassistant.components.bluetooth.scanner.OriginalBleakScanner",
return_value=scanner,
):
await async_setup_with_one_adapter(hass)
assert called_start == 1
scanner = _get_manager()
mock_discovered = [MagicMock()]
# Ensure we don't restart the scanner if we don't need to
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic + 10,
):
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert called_start == 1
# Ensure we don't restart the scanner if we don't need to
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic + 20,
):
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert called_start == 1
# We hit the timer with no detections, so we reset the adapter and restart the scanner
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic
+ SCANNER_WATCHDOG_TIMEOUT
+ SCANNER_WATCHDOG_INTERVAL.total_seconds(),
), patch(
"homeassistant.components.bluetooth.util.recover_adapter", return_value=True
) as mock_recover_adapter:
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert len(mock_recover_adapter.mock_calls) == 1
assert called_start == 3
# We hit the timer again the previous start call failed, make sure
# we try again
with patch(
"homeassistant.components.bluetooth.scanner.MONOTONIC_TIME",
return_value=start_time_monotonic
+ SCANNER_WATCHDOG_TIMEOUT
+ SCANNER_WATCHDOG_INTERVAL.total_seconds(),
), patch(
"homeassistant.components.bluetooth.util.recover_adapter", return_value=True
) as mock_recover_adapter:
async_fire_time_changed(hass, dt_util.utcnow() + SCANNER_WATCHDOG_INTERVAL)
await hass.async_block_till_done()
assert len(mock_recover_adapter.mock_calls) == 1
assert called_start == 4