Diagnostics report for Thread networks (#88541)

* Initial diagnostics

* Include MLP of local OTBR

* Add dep on pyroute2

* Move pyroute2 onto executor

* More comments

* Read thread data direct from zeroconf cache

* Get neighbour cache for known BR's

* isort

* mypy

* Add diagnostic test

* rel import

* Fix pylint

* Restore coverage in discovery.py
This commit is contained in:
Jc2k
2023-02-22 16:24:21 +00:00
committed by GitHub
parent 3afc39dbbf
commit f7bfdfefde
10 changed files with 593 additions and 26 deletions

View File

@ -0,0 +1,208 @@
"""Diagnostics support for Thread networks.
When triaging Matter and HomeKit issues you often need to check for problems with the Thread network.
This report helps spot and rule out:
* Is the user's border router visible at all?
* Is the border router actually announcing any routes? The user could have a network boundary like
VLANs or WiFi isolation that is blocking the RA packets.
* Alternatively, if user isn't on HAOS they could have accept_ra_rt_info_max_plen set incorrectly.
* Are there any bogus routes that could be interfering? If routes don't expire they can build up.
When you have 10 routes and only 2 border routers something has gone wrong.
This does not do any connectivity checks. A user could have all their border routers visible and
still have Thread accessories that can't be pinged - that is still a Thread problem.
"""
from __future__ import annotations
from typing import Any, TypedDict
from pyroute2 import NDB # pylint: disable=no-name-in-module
from python_otbr_api.tlv_parser import MeshcopTLVType
from homeassistant.components import zeroconf
from homeassistant.config_entries import ConfigEntry
from homeassistant.core import HomeAssistant
from .dataset_store import async_get_store
from .discovery import async_read_zeroconf_cache
class Neighbour(TypedDict):
    """One entry of the neighbour cache, as listed by `ip neigh`."""

    # Values are copied verbatim from the matching attributes of an NDB neighbour record.
    lladdr: str
    state: int
    probes: int
class Route(TypedDict):
    """One entry of the IPv6 route table, as listed by `ip -6 route`."""

    metrics: int
    priority: int
    # True when the gateway was taken from a multi-via "nexthop" route.
    is_nexthop: bool
class Router(TypedDict):
    """Everything the report knows about a single border router."""

    server: str | None
    addresses: list[str]
    # Neighbour cache entries, keyed by the router address they belong to.
    neighbours: dict[str, Neighbour]
    thread_version: str | None
    model: str | None
    vendor: str | None
    # Routes that use one of this router's addresses as gateway, keyed by destination.
    routes: dict[str, Route]
class Network(TypedDict):
    """The report entry for one thread network."""

    name: str | None
    # Border routers seen for this network, keyed by their server name.
    routers: dict[str, Router]
    prefixes: set[str]
    # Gateways that route one of the prefixes but are not a known router address.
    unexpected_routers: set[str]
def _get_possible_thread_routes() -> (
    tuple[dict[str, dict[str, Route]], dict[str, set[str]]]
):
    """Collect the routes that might have been announced by a thread border router.

    Right now that means IPv6 /64 routes that have a gateway. The caller cross
    references the result with zeroconf data to confirm which gateways are known
    border routers.

    Returns a pair of mappings:
    * gateway -> {destination -> route details}
    * destination -> set of gateways that route it
    """
    by_gateway: dict[str, dict[str, Route]] = {}
    by_destination: dict[str, set[str]] = {}

    with NDB() as ndb:
        for entry in ndb.routes:
            # Only IPv6 routes (family 10) for /64 prefixes are of interest
            if entry.family != 10 or entry.dst_len != 64:
                continue

            # Routes without any via can't belong to a border router
            via = entry.gateway or entry.nh_gateway
            if not via:
                continue

            details: Route = {
                "metrics": entry.metrics,
                "priority": entry.priority,
                # NM creates "nexthop" routes - a single route with many via's
                # Kernel creates many routes with a single via
                "is_nexthop": entry.nh_gateway is not None,
            }
            by_gateway.setdefault(via, {})[entry.dst] = details
            by_destination.setdefault(entry.dst, set()).add(via)

    return by_gateway, by_destination
def _get_neighbours() -> dict[str, Neighbour]:
    """Return the neighbour cache read through NDB, keyed by neighbour address."""
    with NDB() as ndb:
        cache: dict[str, Neighbour] = {
            entry.dst: {
                "lladdr": entry.lladdr,
                "state": entry.state,
                "probes": entry.probes,
            }
            for entry in ndb.neighbours
        }
    return cache
async def async_get_config_entry_diagnostics(
    hass: HomeAssistant, entry: ConfigEntry
) -> dict[str, Any]:
    """Return diagnostics for all known thread networks.

    The report is built in four steps:
    1. Seed the networks from the datasets in the dataset store.
    2. Read the route table and neighbour cache (on the executor).
    3. Attach every border router found in the zeroconf cache to its network,
       together with the routes and neighbour entries matching its addresses.
    4. For every network, collect the gateways that route one of its prefixes
       but are not an address of a known border router.

    Returns a dict with a single "networks" key, mapping extended PAN id to the
    Network entry.
    """
    networks: dict[str, Network] = {}

    # Start with all networks that HA knows about
    store = await async_get_store(hass)
    for record in store.datasets.values():
        if not record.extended_pan_id:
            continue
        network = networks.setdefault(
            record.extended_pan_id, _new_network(record.network_name)
        )
        if mlp := record.dataset.get(MeshcopTLVType.MESHLOCALPREFIX):
            network["prefixes"].add(f"{mlp[0:4]}:{mlp[4:8]}:{mlp[8:12]}:{mlp[12:16]}")

    # Find all routes currently active that might be thread related, so we can
    # match them to border routers as we process the zeroconf data.
    routes, reverse_routes = await hass.async_add_executor_job(
        _get_possible_thread_routes
    )

    # Find all neighbours
    neighbours = await hass.async_add_executor_job(_get_neighbours)

    aiozc = await zeroconf.async_get_async_instance(hass)
    for data in async_read_zeroconf_cache(aiozc):
        if not data.extended_pan_id:
            continue

        network = networks.setdefault(
            data.extended_pan_id, _new_network(data.network_name)
        )

        if not data.server:
            continue

        router: Router = {
            "server": data.server,
            "addresses": data.addresses or [],
            "neighbours": {},
            "thread_version": data.thread_version,
            "model": data.model_name,
            "vendor": data.vendor_name,
            "routes": {},
        }
        network["routers"][data.server] = router

        # For every address this border router has, see if we have seen
        # it in the route table as a via - these are the routes it's
        # announcing via RA
        for address in router["addresses"]:
            if address in routes:
                router["routes"].update(routes[address])

            if address in neighbours:
                router["neighbours"][address] = neighbours[address]

        network["prefixes"].update(router["routes"].keys())

    # Find unexpected via's.
    # Collect all router addresses and then for each prefix, find via's that aren't
    # a known router for that prefix.
    for network in networks.values():
        known_addresses: set[str] = set()
        for router in network["routers"].values():
            known_addresses.update(router["addresses"])

        for prefix in network["prefixes"]:
            if prefix not in reverse_routes:
                continue
            # Accumulate rather than assign: a network can have several prefixes
            # and the ghosts found for one must not replace those of another.
            network["unexpected_routers"].update(
                reverse_routes[prefix] - known_addresses
            )

    return {
        "networks": networks,
    }


def _new_network(name: str | None) -> Network:
    """Return an empty report entry for a thread network called *name*."""
    return {
        "name": name,
        "routers": {},
        "prefixes": set(),
        "unexpected_routers": set(),
    }

View File

@ -4,9 +4,10 @@ from __future__ import annotations
from collections.abc import Callable
import dataclasses
import logging
from typing import cast
from zeroconf import ServiceListener, Zeroconf
from zeroconf.asyncio import AsyncZeroconf
from zeroconf import BadTypeInNameException, DNSPointer, ServiceListener, Zeroconf
from zeroconf.asyncio import AsyncServiceInfo, AsyncZeroconf
from homeassistant.components import zeroconf
from homeassistant.core import HomeAssistant
@ -19,6 +20,8 @@ KNOWN_BRANDS: dict[str | None, str] = {
"HomeAssistant": "homeassistant",
}
THREAD_TYPE = "_meshcop._udp.local."
CLASS_IN = 1
TYPE_PTR = 12
@dataclasses.dataclass
@ -31,6 +34,65 @@ class ThreadRouterDiscoveryData:
network_name: str | None
server: str | None
vendor_name: str | None
addresses: list[str] | None
thread_version: str | None
def async_discovery_data_from_service(
    service: AsyncServiceInfo,
) -> ThreadRouterDiscoveryData:
    """Build a ThreadRouterDiscoveryData from the given AsyncServiceInfo.

    Text properties that are missing or are not valid UTF-8 become None.
    """
    props = service.properties

    def text_property(key: bytes) -> str | None:
        """Return the property as text, or None if missing or undecodable."""
        raw = props.get(key)
        if raw is None:
            return None
        try:
            return raw.decode()
        except UnicodeDecodeError:
            return None

    # The extended PAN id is reported as hex instead of being decoded as text
    raw_ext_pan_id = props.get(b"xp")
    extended_pan_id = None if raw_ext_pan_id is None else raw_ext_pan_id.hex()
    vendor = text_property(b"vn")

    return ThreadRouterDiscoveryData(
        brand=KNOWN_BRANDS.get(vendor),
        extended_pan_id=extended_pan_id,
        model_name=text_property(b"mn"),
        network_name=text_property(b"nn"),
        server=service.server,
        vendor_name=vendor,
        addresses=service.parsed_addresses(),
        thread_version=text_property(b"tv"),
    )
def async_read_zeroconf_cache(aiozc: AsyncZeroconf) -> list[ThreadRouterDiscoveryData]:
    """Return discovery data for every meshcop service already in the zeroconf cache.

    Pointer records with a bad type in their name, and services that can't be
    fully loaded from the cache, are left out.
    """
    zc = aiozc.zeroconf
    found: list[ThreadRouterDiscoveryData] = []

    for entry in zc.cache.async_all_by_details(THREAD_TYPE, TYPE_PTR, CLASS_IN):
        pointer = cast(DNSPointer, entry)

        try:
            info = AsyncServiceInfo(THREAD_TYPE, pointer.alias)
        except BadTypeInNameException as ex:
            _LOGGER.debug(
                "Ignoring record with bad type in name: %s: %s", pointer.alias, ex
            )
            continue

        # Only use services whose data is fully in the cache, ignore the rest for now
        if info.load_from_cache(zc):
            found.append(async_discovery_data_from_service(info))

    return found
class ThreadRouterDiscovery:
@ -83,15 +145,6 @@ class ThreadRouterDiscovery:
_LOGGER.debug("_add_update_service failed to add %s, %s", type_, name)
return
def try_decode(value: bytes | None) -> str | None:
"""Try decoding UTF-8."""
if value is None:
return None
try:
return value.decode()
except UnicodeDecodeError:
return None
_LOGGER.debug("_add_update_service %s %s", name, service)
# We use the extended mac address as key, bail out if it's missing
try:
@ -99,19 +152,8 @@ class ThreadRouterDiscovery:
except (KeyError, UnicodeDecodeError) as err:
_LOGGER.debug("_add_update_service failed to parse service %s", err)
return
ext_pan_id = service.properties.get(b"xp")
network_name = try_decode(service.properties.get(b"nn"))
model_name = try_decode(service.properties.get(b"mn"))
server = service.server
vendor_name = try_decode(service.properties.get(b"vn"))
data = ThreadRouterDiscoveryData(
brand=KNOWN_BRANDS.get(vendor_name),
extended_pan_id=ext_pan_id.hex() if ext_pan_id is not None else None,
model_name=model_name,
network_name=network_name,
server=server,
vendor_name=vendor_name,
)
data = async_discovery_data_from_service(service)
if name in self._known_routers and self._known_routers[name] == (
extended_mac_address,
data,

View File

@ -7,6 +7,6 @@
"documentation": "https://www.home-assistant.io/integrations/thread",
"integration_type": "service",
"iot_class": "local_polling",
"requirements": ["python-otbr-api==1.0.3"],
"requirements": ["python-otbr-api==1.0.3", "pyroute2==0.7.5"],
"zeroconf": ["_meshcop._udp.local."]
}

View File

@ -1916,6 +1916,9 @@ pyrisco==0.5.7
# homeassistant.components.rituals_perfume_genie
pyrituals==0.0.6
# homeassistant.components.thread
pyroute2==0.7.5
# homeassistant.components.ruckus_unleashed
pyruckus==0.16

View File

@ -1384,6 +1384,9 @@ pyrisco==0.5.7
# homeassistant.components.rituals_perfume_genie
pyrituals==0.0.6
# homeassistant.components.thread
pyroute2==0.7.5
# homeassistant.components.ruckus_unleashed
pyruckus==0.16

View File

@ -3,6 +3,7 @@
import pytest
from homeassistant.components import thread
from homeassistant.core import HomeAssistant
from tests.common import MockConfigEntry
@ -10,7 +11,7 @@ CONFIG_ENTRY_DATA = {}
@pytest.fixture(name="thread_config_entry")
async def thread_config_entry_fixture(hass):
async def thread_config_entry_fixture(hass: HomeAssistant):
"""Mock Thread config entry."""
config_entry = MockConfigEntry(
data=CONFIG_ENTRY_DATA,

View File

@ -0,0 +1,76 @@
# serializer version: 1
# name: test_diagnostics
dict({
'networks': dict({
'1111111122222222': dict({
'name': 'OpenThreadDemo',
'prefixes': list([
'fdad:70bf:e5aa:15dd',
]),
'routers': dict({
}),
'unexpected_routers': list([
]),
}),
'c3a60fc387c381c286212cc3a5': dict({
'name': 'OpenThread HC',
'prefixes': list([
'fd59:86c6:e5a5::',
]),
'routers': dict({
'HomeAssistant OpenThreadBorderRouter #0BBF._meshcop._udp.local.': dict({
'addresses': list([
'127.0.0.1',
'fe80::10ed:6406:4ee9:85e5',
]),
'model': 'OpenThreadBorderRouter',
'neighbours': dict({
'fe80::10ed:6406:4ee9:85e5': dict({
'lladdr': '00:00:00:00:00:00',
'probes': 64,
'state': 64,
}),
}),
'routes': dict({
'fd59:86c6:e5a5::': dict({
'is_nexthop': False,
'metrics': 100,
'priority': 100,
}),
}),
'server': 'HomeAssistant OpenThreadBorderRouter #0BBF._meshcop._udp.local.',
'thread_version': '1.3.0',
'vendor': 'HomeAssistant',
}),
'HomePod._meshcop._udp.local.': dict({
'addresses': list([
'127.0.0.1',
'fe80::10ed:6406:4ee9:85e4',
]),
'model': None,
'neighbours': dict({
'fe80::10ed:6406:4ee9:85e4': dict({
'lladdr': '00:00:00:00:00:00',
'probes': 64,
'state': 64,
}),
}),
'routes': dict({
'fd59:86c6:e5a5::': dict({
'is_nexthop': True,
'metrics': 100,
'priority': 100,
}),
}),
'server': 'HomePod._meshcop._udp.local.',
'thread_version': '1.2.0',
'vendor': 'Apple',
}),
}),
'unexpected_routers': list([
'fe80::10ed:6406:4ee9:85e3',
]),
}),
}),
})
# ---

View File

@ -0,0 +1,224 @@
"""Test the thread diagnostics."""
import dataclasses
import time
from unittest.mock import Mock, patch
import pytest
from syrupy.assertion import SnapshotAssertion
from zeroconf import DNSCache, ServiceInfo
from homeassistant.components.thread import dataset_store
from homeassistant.components.thread.const import DOMAIN
from homeassistant.core import HomeAssistant
from homeassistant.setup import async_setup_component
from . import DATASET_1
from tests.components.diagnostics import get_diagnostics_for_config_entry
from tests.typing import ClientSessionGenerator
# A meshcop service announced by a HomeAssistant border router.
# test_diagnostics puts its address, service, text and pointer records in the
# zeroconf cache, and also adds a neighbour and a plain gateway route for its
# fe80::...85e5 address.
TEST_ZEROCONF_RECORD_1 = ServiceInfo(
type_="_meshcop._udp.local.",
name="HomeAssistant OpenThreadBorderRouter #0BBF._meshcop._udp.local.",
addresses=["127.0.0.1", "fe80::10ed:6406:4ee9:85e5"],
port=8080,
properties={
"rv": "1",
"vn": "HomeAssistant",
"mn": "OpenThreadBorderRouter",
"nn": "OpenThread HC",
"xp": "\xe6\x0f\xc7\xc1\x86!,\xe5",
"tv": "1.3.0",
"xa": "\xae\xeb/YKW\x0b\xbf",
"sb": "\x00\x00\x01\xb1",
"at": "\x00\x00\x00\x00\x00\x01\x00\x00",
"pt": "\x8f\x06Q~",
"sq": "3",
"bb": "\xf0\xbf",
"dn": "DefaultDomain",
},
)
# A second border router with the same "nn"/"xp" properties as record 1.
# It has no "mn" property. test_diagnostics caches all of its records and adds
# a neighbour and an nh_gateway route for its fe80::...85e4 address.
TEST_ZEROCONF_RECORD_2 = ServiceInfo(
type_="_meshcop._udp.local.",
name="HomePod._meshcop._udp.local.",
addresses=["127.0.0.1", "fe80::10ed:6406:4ee9:85e4"],
port=8080,
properties={
"rv": "1",
"vn": "Apple",
"nn": "OpenThread HC",
"xp": "\xe6\x0f\xc7\xc1\x86!,\xe5",
"tv": "1.2.0",
"xa": "\xae\xeb/YKW\x0b\xbf",
"sb": "\x00\x00\x01\xb1",
"at": "\x00\x00\x00\x00\x00\x01\x00\x00",
"pt": "\x8f\x06Q~",
"sq": "3",
"bb": "\xf0\xbf",
"dn": "DefaultDomain",
},
)
# Used for the "invalid cache" case: test_diagnostics only adds the pointer
# record of this service to the cache, not its address, service or text records.
TEST_ZEROCONF_RECORD_3 = ServiceInfo(
type_="_meshcop._udp.local.",
name="office._meshcop._udp.local.",
addresses=["127.0.0.1", "fe80::10ed:6406:4ee9:85e0"],
port=8080,
properties={
"rv": "1",
"vn": "Apple",
"nn": "OpenThread HC",
"xp": "\xe6\x0f\xc7\xc1\x86!,\xe5",
"tv": "1.2.0",
"xa": "\xae\xeb/YKW\x0b\xbf",
"sb": "\x00\x00\x01\xb1",
"at": "\x00\x00\x00\x00\x00\x01\x00\x00",
"pt": "\x8f\x06Q~",
"sq": "3",
"bb": "\xf0\xbf",
"dn": "DefaultDomain",
},
)
# Used for the "invalid record" case: constructed with the same arguments as
# record 3, then given a broken name below. All of its records are cached.
TEST_ZEROCONF_RECORD_4 = ServiceInfo(
type_="_meshcop._udp.local.",
name="office._meshcop._udp.local.",
addresses=["127.0.0.1", "fe80::10ed:6406:4ee9:85e0"],
port=8080,
properties={
"rv": "1",
"vn": "Apple",
"nn": "OpenThread HC",
"xp": "\xe6\x0f\xc7\xc1\x86!,\xe5",
"tv": "1.2.0",
"xa": "\xae\xeb/YKW\x0b\xbf",
"sb": "\x00\x00\x01\xb1",
"at": "\x00\x00\x00\x00\x00\x01\x00\x00",
"pt": "\x8f\x06Q~",
"sq": "3",
"bb": "\xf0\xbf",
"dn": "DefaultDomain",
},
)
# Make sure this generates an invalid DNSPointer: the name is overwritten after
# construction with one that has a NUL character inside the "local." label.
TEST_ZEROCONF_RECORD_4.name = "office._meshcop._udp.lo\x00cal."
@dataclasses.dataclass
class MockRoute:
    """Stand-in for a route table record as read by the thread diagnostics."""

    dst: str
    # A route carries its via in either `gateway` or `nh_gateway`
    gateway: str | None = None
    nh_gateway: str | None = None
    metrics: int = 100
    priority: int = 100
    # The defaults describe an IPv6 (family 10) route for a /64 prefix
    family: int = 10
    dst_len: int = 64
@dataclasses.dataclass
class MockNeighbour:
    """Stand-in for a neighbour cache record as read by the thread diagnostics."""

    dst: str
    lladdr: str = "00:00:00:00:00:00"
    state: int = 64
    probes: int = 64
@pytest.fixture
def ndb() -> Mock:
    """Patch NDB in the diagnostics module so the OS route tables are never read.

    Yields the object the patched NDB hands out when used as a context manager,
    with empty `neighbours` and `routes` lists for the test to fill in.
    """
    with patch("homeassistant.components.thread.diagnostics.NDB") as ndb_cls:
        with ndb_cls() as instance:
            instance.neighbours = []
            instance.routes = []
            yield instance
async def test_diagnostics(
    hass: HomeAssistant,
    mock_async_zeroconf: None,
    ndb: Mock,
    hass_client: ClientSessionGenerator,
    snapshot: SnapshotAssertion,
) -> None:
    """Compare the thread diagnostics report with the stored snapshot."""
    cache = mock_async_zeroconf.zeroconf.cache = DNSCache()

    now = time.monotonic() * 1000

    def all_records(info: ServiceInfo) -> list:
        """Return the address, service, text and pointer records of a service."""
        return [
            *info.dns_addresses(created=now),
            info.dns_service(created=now),
            info.dns_text(created=now),
            info.dns_pointer(created=now),
        ]

    # Two border routers that are fully present in the cache
    for info in (TEST_ZEROCONF_RECORD_1, TEST_ZEROCONF_RECORD_2):
        cache.async_add_records(all_records(info))

    # Test for invalid cache
    cache.async_add_records([TEST_ZEROCONF_RECORD_3.dns_pointer(created=now)])

    # Test for invalid record
    cache.async_add_records(all_records(TEST_ZEROCONF_RECORD_4))

    assert await async_setup_component(hass, DOMAIN, {})
    await hass.async_block_till_done()

    config_entry = hass.config_entries.async_entries(DOMAIN)[0]

    await dataset_store.async_add_dataset(hass, "source", DATASET_1)

    # Neighbour cache entries for both known border routers
    ndb.neighbours.extend(
        [
            MockNeighbour(dst="fe80::10ed:6406:4ee9:85e5"),
            MockNeighbour(dst="fe80::10ed:6406:4ee9:85e4"),
        ]
    )

    ndb.routes.extend(
        [
            MockRoute(
                dst="fd59:86c6:e5a5::",
                gateway="fe80::10ed:6406:4ee9:85e5",
            ),
            MockRoute(
                dst="fd59:86c6:e5a5::",
                nh_gateway="fe80::10ed:6406:4ee9:85e4",
            ),
            # Add a "ghost" route - we don't know a border router on 85e3
            MockRoute(
                dst="fd59:86c6:e5a5::",
                nh_gateway="fe80::10ed:6406:4ee9:85e3",
            ),
        ]
    )

    diag = await get_diagnostics_for_config_entry(hass, hass_client, config_entry)

    assert diag == snapshot

View File

@ -73,6 +73,8 @@ async def test_discover_routers(hass: HomeAssistant, mock_async_zeroconf: None)
network_name="OpenThread HC",
server="core-silabs-multiprotocol.local.",
vendor_name="HomeAssistant",
thread_version="1.3.0",
addresses=["192.168.0.115"],
),
)
@ -95,6 +97,8 @@ async def test_discover_routers(hass: HomeAssistant, mock_async_zeroconf: None)
network_name="NEST-PAN-E1AF",
server="2d99f293-cd8e-2770-8dd2-6675de9fa000.local.",
vendor_name="Google Inc.",
thread_version="1.3.0",
addresses=["192.168.0.124"],
),
)
@ -163,6 +167,8 @@ async def test_discover_routers_bad_data(
network_name="OpenThread HC",
server="core-silabs-multiprotocol.local.",
vendor_name=None,
thread_version="1.3.0",
addresses=["192.168.0.115"],
),
)

View File

@ -240,6 +240,8 @@ async def test_discover_routers(
"network_name": "OpenThread HC",
"server": "core-silabs-multiprotocol.local.",
"vendor_name": "HomeAssistant",
"addresses": ["192.168.0.115"],
"thread_version": "1.3.0",
},
"key": "aeeb2f594b570bbf",
"type": "router_discovered",
@ -265,6 +267,8 @@ async def test_discover_routers(
"network_name": "NEST-PAN-E1AF",
"server": "2d99f293-cd8e-2770-8dd2-6675de9fa000.local.",
"vendor_name": "Google Inc.",
"thread_version": "1.3.0",
"addresses": ["192.168.0.124"],
},
"key": "f6a99b425a67abed",
"type": "router_discovered",