Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%
280 statements
coverage.py v6.4.4, created at 2022-08-31 10:07 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does not execute the query; the
    query is not executed until the instance is iterated over (or some other
    operation that involves iteration is performed).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
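
    For example, iterating over a results object executes the query (a
    sketch; the ``registry`` object and the dimension names are assumed)::

        results = registry.queryDataIds(["exposure", "detector"])
        for data_id in results:
            print(data_id["exposure"], data_id["detector"])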
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`.  The outer mapping has `str` keys (the names of
            dimension elements).  The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on the order_by columns being passed from
            # Query to MaterializedQuery, so order_by and limit are not used
            # here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `DatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
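
        For example (a sketch; the dataset type and collection names are
        assumed)::

            refs = registry.queryDataIds(["visit", "detector"]).findDatasets(
                "calexp", collections=["HSC/runs/RC2"], findFirst=True
            )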
        """
        if not isinstance(datasetType, DatasetType):
            storage = self._query.managers.datasets.find(datasetType)
            if storage is None:
                return ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"Dataset type {datasetType!r} is not registered, so no instances of it can exist in "
                        "any collection."
                    ],
                )
            else:
                datasetType = storage.datasetType
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have only dimensions in "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            components = [datasetType.component()]
            datasetType = datasetType.makeCompositeDatasetType()
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
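
        For example, an inexpensive upper bound can be requested before an
        exact count (a sketch; the ``results`` object is assumed)::

            upper_bound = results.count(exact=False)  # may overcount
            n_rows = results.count()  # exact, but may run the full query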
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found.  If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
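
        For example, the cheapest possible check skips query execution
        entirely (a sketch; the ``results`` object is assumed)::

            if not results.any(execute=False, exact=False):
                print("Known to return no rows before execution.")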
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries.  Callers can short-circuit this at any time by simply not
        iterating further.
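
        A typical diagnostic loop might look like this (a sketch; the
        ``results`` object is assumed)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)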
        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object with the ordering applied.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place; the result can be used directly to support method
        chaining.
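
        For example, descending ordering uses the ``-`` prefix (a sketch;
        ``registry`` and the dimension name are assumed)::

            ordered = registry.queryDataIds(["visit"]).order_by("-visit")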
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object with the limit applied.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, so it supports method chaining.  It is normally
        used together with the `order_by` method.
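
        For example, ordering and paging can be combined (a sketch;
        ``registry`` and the dimension name are assumed)::

            page = registry.queryDataIds(["visit"]).order_by("-visit").limit(10, 20)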
        """
        return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
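
        For example (a sketch; the ``all_datasets`` results object is
        assumed)::

            for single_type in all_datasets.byParentDatasetType():
                print(single_type.parentDatasetType.name)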
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it
            is already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count.  If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found.  If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries.  Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with the `str` names of dimension
        elements as outer keys; each value maps
        ``tuple(record.dataId.values())`` to the corresponding
        `DimensionRecord` instance ``record``.
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets
        but different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
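
        For example (a sketch; the ``results`` object and the component name
        are assumed)::

            masks = results.withComponents(["mask"])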
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count.  If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found.  If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return ordered results; this may be
            ``self``, modified in place.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return a limited set of records;
            this may be ``self``, modified in place.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.  It is normally used together with the `order_by` method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries.  Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when ordering `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags; `False` for descending
        ordering, `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
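
    For example, sorting on two attributes with mixed directions (a sketch;
    the attribute names and the ``records`` list are assumed)::

        attributes = ["day_obs", "seeing"]
        ordering = [True, False]  # ascending, then descending
        records.sort(key=lambda r: _DimensionRecordKey(attributes, ordering, r))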
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare this key to another key, attribute by attribute.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys are identical, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Attribute names can be dotted (e.g. "timespan.begin"), so a
            # plain getattr cannot be used; attrgetter handles both cases.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                # Descending ordering: swap the operands.
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """An implementation of `DimensionRecordQueryResults` backed by a
    database query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Query results for the data IDs whose records should be fetched.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then the matching
        # dimension record exists too, so their counts must be equal).
        # fetch() does not guarantee ordering, so we need to sort records in
        # memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering.  Note that here we only support ordering by direct
        # attributes of the element, and not by other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: List[str] = []
        ordering: List[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()