From 9f5de63c6191d06a9bbb42e4e90efb90c65b8b7e Mon Sep 17 00:00:00 2001 From: Greg Laabs Date: Sat, 29 Jul 2017 18:39:03 -0700 Subject: [PATCH] Rewrite of the "first synthetic datapoint" query for multiple entities The old method was written in a manner that prevented an index from being used in the innermost GROUP BY statement, causing massive performance issues, especially when querying for a large time period. The new query does have one material change that will cause it to return different results than before: instead of using max(state_id) to get the latest entry, we now use max(last_updated). This is more appropriate (the primary key should not be assumed to be in order of event firing) and allows an index to be used on the innermost query. I added another JOIN layer to account for cases where there are two entries with the exact same `last_updated` for a given entity. In this case we do use `state_id` as a tiebreaker. For performance reasons the domain filters were also moved to the outermost query, as it's far more efficient to apply them there than on the innermost query as before (due to problems using an index together with GROUP BY). The result is a query that only needs to do a filesort on the final result set, which will only be as many rows as there are entities.
--- homeassistant/components/history.py | 41 ++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/homeassistant/components/history.py b/homeassistant/components/history.py index 893ff23df35..0eb177fb5c7 100644 --- a/homeassistant/components/history.py +++ b/homeassistant/components/history.py @@ -130,34 +130,49 @@ def get_states(hass, utc_point_in_time, entity_ids=None, run=None, ).order_by( States.created.desc()) - if filters: - most_recent_state_ids = filters.apply(most_recent_state_ids, - entity_ids) - most_recent_state_ids = most_recent_state_ids.limit(1) else: # We have more than one entity to look at (most commonly we want # all entities,) so we need to do a search on all states since the # last recorder run started. - most_recent_state_ids = session.query( - func.max(States.state_id).label('max_state_id') + + most_recent_states_by_date = session.query( + States.entity_id.label('max_entity_id'), + func.max(States.last_updated).label('max_last_updated') ).filter( (States.created >= run.start) & - (States.created < utc_point_in_time) & - (~States.domain.in_(IGNORE_DOMAINS))) + (States.created < utc_point_in_time) + ) - if filters: - most_recent_state_ids = filters.apply(most_recent_state_ids, - entity_ids) + if entity_ids: + most_recent_states_by_date.filter( + States.entity_id.in_(entity_ids)) + + most_recent_states_by_date = most_recent_states_by_date.group_by( + States.entity_id) + + most_recent_states_by_date = most_recent_states_by_date.subquery() + + most_recent_state_ids = session.query( + func.max(States.state_id).label('max_state_id') + ).join(most_recent_states_by_date, and_( + States.entity_id == most_recent_states_by_date.c.max_entity_id, + States.last_updated == most_recent_states_by_date.c. 
+ max_last_updated)) most_recent_state_ids = most_recent_state_ids.group_by( States.entity_id) most_recent_state_ids = most_recent_state_ids.subquery() - query = session.query(States).join(most_recent_state_ids, and_( - States.state_id == most_recent_state_ids.c.max_state_id)) + query = session.query(States).join( + most_recent_state_ids, + States.state_id == most_recent_state_ids.c.max_state_id + ).filter((~States.domain.in_(IGNORE_DOMAINS))) + + if filters: + query = filters.apply(query, entity_ids) return [state for state in execute(query) if not state.attributes.get(ATTR_HIDDEN, False)]