Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%
282 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-27 01:57 -0700
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-27 01:57 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DatabaseDimensionRecordQueryResults",
26 "DataCoordinateQueryResults",
27 "DatasetQueryResults",
28 "DimensionRecordQueryResults",
29 "ParentDatasetQueryResults",
30)
32import itertools
33import operator
34from abc import abstractmethod
35from contextlib import ExitStack, contextmanager
36from typing import (
37 Any,
38 Callable,
39 ContextManager,
40 Iterable,
41 Iterator,
42 List,
43 Mapping,
44 Optional,
45 Sequence,
46 Tuple,
47 Union,
48)
50import sqlalchemy
52from ...core import (
53 DataCoordinate,
54 DataCoordinateIterable,
55 DatasetRef,
56 DatasetType,
57 Dimension,
58 DimensionGraph,
59 DimensionRecord,
60 SimpleQuery,
61)
62from ..interfaces import Database, DimensionRecordStorage
63from ._query import Query
64from ._structs import ElementOrderByClause, QuerySummary
QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type alias for a query factory method used by `DataCoordinateQueryResults`.

The callable receives optional ``order_by`` column names and an optional
``(limit, offset)`` tuple, and returns a `Query`.
"""
class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of `Query` class.
    graph : `DimensionGraph`
        Dimensions used by query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit for the number of returned records and optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        # Lazily-constructed Query; built on first access to the ``_query``
        # property so that constructing this object stays cheap.
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit for the number of returned records and optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return `False`.
            The outer mapping has `str` keys (the names of dimension elements).
            The inner mapping has `tuple` keys representing data IDs (tuple
            conversions of `DataCoordinate.values()`) and `DimensionRecord`
            values.
        """

        # The factory ignores its arguments and always returns the given
        # query; ordering/limits are assumed to be baked into it already.
        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance potentially updating some attributes."""
        # NOTE: `None` means "keep current value" here, so an attribute can
        # never be reset back to `None` via this method.
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)"""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # An empty graph needs no records, hence the ``not self._graph`` term.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well database engine's
        query optimizer can simplify those particular follow-up queries and
        how efficiently it caches query results even when they are not
        explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns to be passed from Query
            # to MaterializedQuery, so order_by and limit are not used.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            # Fetch records for each dimension element, keyed by the tuple
            # form of the record's data ID.
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            # Nothing to do: same dimensions and already unique (or
            # uniqueness not requested).
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Keep only the record mappings for elements still in the graph.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            # Join the caller's query against this query's SQL on all
            # required dimension columns.
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `DatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        """
        if not isinstance(datasetType, DatasetType):
            # Resolve a dataset type name to its registered DatasetType; an
            # unregistered name yields an empty (doomed) result instead of
            # an error.
            storage = self._query.managers.datasets.find(datasetType)
            if storage is None:
                return ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type {datasetType!r} found, so no instances can "
                        "exist in any collection."
                    ],
                )
            else:
                datasetType = storage.datasetType
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have only dimensions in "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered result.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return ordered
            result.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None` then the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return limited set
            of records.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining.  Normally this method is used
        together with `order_by` method.
        """
        return self._clone(limit=(limit, offset))
class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.

    Notes
    -----
    This is an abstract interface; ``Iterable`` provides the ``ABCMeta``
    metaclass that makes `abstractmethod` effective here.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        raise NotImplementedError()
class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
        assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    # Fixed: the previous tuple listed a never-assigned "_dimensions" slot and
    # omitted "_datasetType", which __init__ does assign.
    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            # Expand each parent row into one ref per requested component;
            # `None` stands for the parent dataset type itself.
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            # Pass the dataset type explicitly (consistent with
            # `withComponents` and `expanded`) instead of relying on the
            # materialized query to carry it.
            yield ParentDatasetQueryResults(
                self._db,
                materialized,
                components=self._components,
                records=self._records,
                datasetType=self._datasetType,
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            # Reuse the data-ID expansion machinery to fetch the records.
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.  Each query row yields one ref per component.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)
class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    # Fixed: "_doomed_by" was missing even though __init__ assigns it.
    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        # ExitStack keeps every child's temporary table alive for the
        # duration of this context.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                # Fixed: propagate diagnostics instead of dropping them.
                doomed_by=self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        # Fixed: propagate diagnostics instead of dropping them.
        return ChainedDatasetQueryResults(
            [r.expanded() for r in self._chain], doomed_by=self._doomed_by
        )

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by
class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """Interface for objects representing the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Return the number of rows this query would produce.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, execute the full query and include any post-query
            filtering in the count.  If `False`, the returned value may only
            be an upper bound.

        Returns
        -------
        count : `int`
            Number of rows the query would return, or an upper bound when
            ``exact=False``.

        Notes
        -----
        The value counts rows, not unique rows, so even with ``exact=True``
        it can exceed the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Report whether this query would return any rows.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, run at least a ``LIMIT 1`` query when emptiness cannot
            be established without executing anything.
        exact : `bool`, optional
            If `True`, run the full query with post-query filtering until at
            least one result row is found.  If `False`, post-query filtering
            is not accounted for, so `True` may be returned even when every
            row would ultimately be filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or, depending on the arguments, might)
            yield result rows; `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Request that iteration yield records in a particular order.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to sort by.  Prefix a name with
            a minus sign (``-``) for descending order.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return ordered results.

        Notes
        -----
        Implementations may modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Restrict iteration to a limited number of records.

        Parameters
        ----------
        limit : `int`
            Maximum number of records to return.
        offset : `int` or `None`
            If not `None`, the number of records to skip before yielding up
            to ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return a limited set of records.

        Notes
        -----
        Implementations may modify the iterator in place and return the same
        instance.  Typically used together with `order_by`.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Yield human-readable messages that may explain an empty result.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages describing reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available once the
        iterator has been exhausted, or after `any` or `count` has already
        been called (with ``exact=True`` for those two).

        Messages generated while the query is being built or filtered are
        yielded first; further diagnostics may then be produced by what
        should be inexpensive follow-up queries.  Callers can short-circuit
        at any point simply by not iterating further.
        """
        raise NotImplementedError()
class _DimensionRecordKey:
    """Sort key wrapping a `DimensionRecord` for use with `sorted`.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Names of record attributes to compare, in priority order.
    ordering : `Sequence` [ `bool` ]
        Parallel sequence of direction flags: `True` for ascending order,
        `False` for descending order.
    record : `DimensionRecord`
        The record to compare with other records.
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Perform a three-way comparison against another key.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys compare equal, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ascending in zip(self.attributes, self.ordering):
            # attrgetter resolves dotted names such as "timespan.begin",
            # which plain getattr cannot.
            getter = operator.attrgetter(attribute)
            lhs = getter(self.rec)
            rhs = getter(other.rec)
            if lhs == rhs:
                continue
            result = -1 if lhs < rhs else 1
            # Descending order simply flips the sign of the comparison.
            return result if ascending else -result
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0
class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query over data IDs.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Query results yielding the data IDs of interest.
    recordStorage : `DimensionRecordStorage`
        Storage object used to fetch dimension records for those data IDs.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        # Column/dimension names from the most recent order_by() call; empty
        # means no in-memory sort is required.
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # Any LIMIT is already applied at the DataCoordinateQueryResults
        # level (the assumption being that a dimension record exists for
        # every data ID, so their counts must be equal).  fetch() does not
        # guarantee ordering, so when order_by() was requested the records
        # are sorted in memory below.
        fetched = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(fetched)

        # Translate the order_by() column names into record attribute names.
        # Only direct attributes of this element are supported here, not
        # other elements from the dimension graph.
        clause = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: List[str] = []
        ordering: List[bool] = []
        for column in clause.order_by_columns:
            if column.column is not None:
                attributes.append(column.column)
            else:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            ordering.append(column.ordering)

        ordered = sorted(
            fetched, key=lambda record: _DimensionRecordKey(attributes, ordering, record)
        )
        return iter(ordered)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        # Push the ordering down to the data-ID query, then remember the
        # column names for the in-memory record sort performed in __iter__.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()