Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%
281 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager, ExitStack, contextmanager
from typing import Any, Optional

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method that creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.
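        For example, a schematic value covering a single ``detector`` data
        ID (names and values here are illustrative)::

            {"detector": {("HSC", 50): detector_record}}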

    Notes
    -----
    Constructing an instance of this class does not execute the query; it is
    executed only when the instance is iterated over (or some other
    operation is performed that involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Query | None = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `tuple` [ `int`, `int` or `None` ], optional
            Limit on the number of returned rows and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of
            dimension elements). The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A new results object backed by the given query.
        """

        def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: QueryFactoryMethod | None = None,
        query: Query | None = None,
        graph: DimensionGraph | None = None,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``. If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a
        single call to `expanded`), it may be much more efficient to start
        by materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from
            # Query to MaterializedQuery, so order_by and limit are not used
            # here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.backend.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: DimensionGraph | None = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
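        # Restrict the caller's query to the data IDs this query would
        # return by joining against our SQL, equating each required
        # dimension column.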
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: DatasetType | str, collections: Any, *, findFirst: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to search all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `DatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have
            records as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
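
        Notes
        -----
        For example, a sketch of a typical search over the data IDs
        identified by this query (the dataset type and collection names here
        are illustrative)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/RC2"])
            for ref in refs:
                ...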
396 """
397 if not isinstance(datasetType, DatasetType):
398 storage = self._query.backend.managers.datasets.find(datasetType)
399 if storage is None:
400 return ChainedDatasetQueryResults(
401 [],
402 doomed_by=[
403 f"Dataset type {datasetType!r} is not registered, so no instances of it can exist in "
404 "any collection."
405 ],
406 )
407 else:
408 datasetType = storage.datasetType
409 if not datasetType.dimensions.issubset(self.graph):
410 raise ValueError(
411 f"findDatasets requires that the dataset type have only dimensions in "
412 f"the DataCoordinateQueryResult used as input to the search, but "
413 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
414 f"dimensions are {self.graph}."
415 )
416 if datasetType.isComponent():
417 # We were given a true DatasetType instance, but it's a component.
418 components = [datasetType.component()]
419 datasetType = datasetType.makeCompositeDatasetType()
420 else:
421 components = [None]
422 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
423 builder = self._query.makeBuilder(summary)
424 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
425 query = builder.finish(joinMissing=False)
426 return ParentDatasetQueryResults(
427 db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
428 )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
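
        For example, a common diagnostic pattern (a sketch; ``results`` is
        an assumed instance of this class)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)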
502 """
503 return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object that will return rows in the requested order.

        Notes
        -----
        This method returns a clone configured with the new ordering rather
        than modifying ``self``, so use the returned object (e.g. via method
        chaining).
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object that will return the limited set of records.

        Notes
        -----
        Like `order_by`, this method returns a clone rather than modifying
        ``self``, so use the returned object. Normally it is used together
        with `order_by`.
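
        For example, to fetch the ten most recent visits (the dimension and
        column names here are illustrative)::

            dataIds = registry.queryDataIds(["visit"], instrument="HSC")
            for dataId in dataIds.order_by("-visit.timespan.begin").limit(10):
                ...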
549 """
550 return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
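
        For example (a sketch; ``results`` is an assumed
        `DatasetQueryResults` instance)::

            for parentResults in results.byParentDatasetType():
                print(parentResults.parentDatasetType.name)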
568 """
569 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``. If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for all data IDs in
        returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it
            is already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys, tuples of ``record.dataId.values()`` as inner keys,
        and the corresponding `DimensionRecord` instances as inner values.
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[str | None],
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
        datasetType: DatasetType | None = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets
        but different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
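
        For example, to iterate over just the ``wcs`` components alongside
        the parent datasets (the component name here is illustrative)::

            for ref in results.withComponents(["wcs", None]):
                ...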
792 """
793 return ParentDatasetQueryResults(
794 self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
795 )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
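        # Each result row from the underlying query yields one `DatasetRef`
        # per requested component, so scale the row count accordingly.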
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
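
    For example, a typical way to obtain and use an instance (a sketch; the
    dimension and data ID values here are assumptions)::

        records = registry.queryDimensionRecords("detector", instrument="HSC")
        for record in records.order_by("detector").limit(5):
            ...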
893 """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return ordered results; this may be
            ``self``.

        Notes
        -----
        This method may modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return a limited set of records;
            this may be ``self``.

        Notes
        -----
        This method may modify the iterator in place and return the same
        instance. Normally it is used together with `order_by`.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when ordering `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending
        ordering, `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
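
    For example, to sort records by ``id`` in descending order (a minimal
    sketch; ``records`` is an assumed iterable of `DimensionRecord`)::

        ordered = sorted(
            records, key=lambda r: _DimensionRecordKey(["id"], [False], r)
        )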
1030 """
1032 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
1033 self.attributes = attributes
1034 self.ordering = ordering
1035 self.rec = record
1037 def _cmp(self, other: _DimensionRecordKey) -> int:
1038 """Compare two records using provided comparison operator.
1040 Parameters
1041 ----------
1042 other : `_DimensionRecordKey`
1043 Key for other record.
1045 Returns
1046 -------
1047 result : `int`
1048 0 if keys are identical, negative if ``self`` is ordered before
1049 ``other``, positive otherwise.
1050 """
1051 for attribute, ordering in zip(self.attributes, self.ordering):
1052 # timespan.begin/end cannot use getattr
1053 attrgetter = operator.attrgetter(attribute)
1054 lhs = attrgetter(self.rec)
1055 rhs = attrgetter(other.rec)
1056 if not ordering:
1057 lhs, rhs = rhs, lhs
1058 if lhs != rhs:
1059 return 1 if lhs > rhs else -1
1060 return 0
1062 def __lt__(self, other: _DimensionRecordKey) -> bool:
1063 return self._cmp(other) < 0
1065 def __gt__(self, other: _DimensionRecordKey) -> bool:
1066 return self._cmp(other) > 0
1068 def __eq__(self, other: Any) -> bool:
1069 if not isinstance(other, _DimensionRecordKey):
1070 return NotImplemented
1071 return self._cmp(other) == 0
1073 def __le__(self, other: _DimensionRecordKey) -> bool:
1074 return self._cmp(other) <= 0
1076 def __ge__(self, other: _DimensionRecordKey) -> bool:
1077 return self._cmp(other) >= 0


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator over the data IDs for which to fetch records.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its
        # dimension record exists too, so their counts must be equal).
        # fetch() does not guarantee ordering, so we need to sort records in
        # memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering. Note that here we only support ordering by direct
        # attributes of the element, and not other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: list[str] = []
        ordering: list[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()