Coverage for python/lsst/daf/butler/registry/queries/_results.py: 32%
273 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-10-04 02:19 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DatabaseDimensionRecordQueryResults",
26 "DataCoordinateQueryResults",
27 "DatasetQueryResults",
28 "DimensionRecordQueryResults",
29 "ParentDatasetQueryResults",
30)
32import itertools
33import operator
34from abc import abstractmethod
35from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
36from contextlib import AbstractContextManager, ExitStack, contextmanager
37from typing import Any, Optional
39import sqlalchemy
41from ...core import (
42 DataCoordinate,
43 DataCoordinateIterable,
44 DatasetRef,
45 DatasetType,
46 Dimension,
47 DimensionGraph,
48 DimensionRecord,
49 SimpleQuery,
50)
51from ..interfaces import Database, DimensionRecordStorage
52from ._query import Query
53from ._structs import ElementOrderByClause, QuerySummary
QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.

The factory receives the optional ``order_by`` column names and the optional
``(limit, offset)`` tuple, and returns the `Query` to execute.
"""
class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of `Query` class.
    graph : `DimensionGraph`
        Dimensions used by query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit for the number of returned records and optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        # Lazily built by the `_query` property on first use.
        self._cached_query: Query | None = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit for the number of returned records and optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return `False`.
            The outer mapping has `str` keys (the names of dimension elements).
            The inner mapping has `tuple` keys representing data IDs (tuple
            conversions of `DataCoordinate.values()`) and `DimensionRecord`
            values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object backed by the given query.
        """

        def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query:
            # The pre-existing query is returned as-is; the arguments are
            # intentionally ignored.
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Lazy generator: the database query runs only when iterated.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: QueryFactoryMethod | None = None,
        query: Query | None = None,
        graph: DimensionGraph | None = None,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance potentially updating some attributes.

        Any argument left as `None` keeps the corresponding attribute of
        ``self``; a new results object is always returned.
        """
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)"""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # An empty graph needs no records, hence the second condition.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well database engine's
        query optimizer can simplify those particular follow-up queries and
        how efficiently it caches query results even when they are not
        explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns to be passed from Query
            # to MaterializedQuery, so order_by and limit are not used.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.backend.managers.dimensions[element].fetch(subset)
                }
            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: DimensionGraph | None = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None
        if self._records is not None:
            # Keep only the record sub-mappings for the elements that survive.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)
        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            # Join this query's SQL in as a subquery, matching on all
            # required dimensions.
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: DatasetType | str, collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        parent_dataset_type, components = self._query.backend.resolve_single_dataset_type_wildcard(
            datasetType, explicit_only=True
        )
        if not parent_dataset_type.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have only dimensions in "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{parent_dataset_type.name} has dimensions {parent_dataset_type.dimensions}, "
                f"while the input dimensions are {self.graph}."
            )
        summary = QuerySummary(
            self.graph, whereRegion=self._query.whereRegion, datasets=[parent_dataset_type]
        )
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(parent_dataset_type, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db,
            query=query,
            components=components,
            records=self._records,
            datasetType=parent_dataset_type,
        )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered result.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose query will be ordered by the given
            columns; ``self`` is not modified.

        Notes
        -----
        This method returns a new instance (via `_clone`); the ordering takes
        effect when the returned object's query is executed.  It supports
        method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None` then the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose query will return the limited set of
            records; ``self`` is not modified.

        Notes
        -----
        This method returns a new instance (via `_clone`) and supports method
        chaining.  Normally this method is used together with `order_by`
        method.
        """
        return self._clone(limit=(limit, offset))
class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        raise NotImplementedError()
class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[str | None],
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
        datasetType: DatasetType | None = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    # Fixed: the tuple previously listed "_dimensions", which is never
    # assigned, and omitted "_datasetType", which is assigned in __init__.
    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            # Pass the dataset type through explicitly so the new results
            # object does not have to re-derive it from the materialized
            # query.
            yield ParentDatasetQueryResults(
                self._db,
                materialized,
                components=self._components,
                records=self._records,
                datasetType=self._datasetType,
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object backed by the same query, iterating over the
            given components.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        # Each query row yields one ref per requested component.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)
class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    # Fixed: "_doomed_by" was previously missing from __slots__ even though
    # __init__ assigns it.
    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            # Propagate the doom diagnostics (previously dropped) so
            # explain_no_results on the materialized results still reports
            # them.
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        # Propagate the doom diagnostics (previously dropped) for the same
        # reason as in materialize.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain], self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by
883class DimensionRecordQueryResults(Iterable[DimensionRecord]):
884 """An interface for objects that represent the results of queries for
885 dimension records.
886 """
    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()
913 @abstractmethod
914 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
915 """Test whether this query returns any results.
917 Parameters
918 ----------
919 execute : `bool`, optional
920 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
921 determined prior to execution that the query would return no rows.
922 exact : `bool`, optional
923 If `True`, run the full query and perform post-query filtering if
924 needed, until at least one result row is found. If `False`, the
925 returned result does not account for post-query filtering, and
926 hence may be `True` even when all result rows would be filtered
927 out.
929 Returns
930 -------
931 any : `bool`
932 `True` if the query would (or might, depending on arguments) yield
933 result rows. `False` if it definitely would not.
934 """
935 raise NotImplementedError()
937 @abstractmethod
938 def order_by(self, *args: str) -> DimensionRecordQueryResults:
939 """Make the iterator return ordered result.
941 Parameters
942 ----------
943 *args : `str`
944 Names of the columns/dimensions to use for ordering. Column name
945 can be prefixed with minus (``-``) to use descending ordering.
947 Returns
948 -------
949 result : `DimensionRecordQueryResults`
950 Returns ``self`` instance which is updated to return ordered
951 result.
953 Notes
954 -----
955 This method can modify the iterator in place and return the same
956 instance.
957 """
958 raise NotImplementedError()
960 @abstractmethod
961 def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
962 """Make the iterator return limited number of records.
964 Parameters
965 ----------
966 limit : `int`
967 Upper limit on the number of returned records.
968 offset : `int` or `None`
969 If not `None` then the number of records to skip before returning
970 ``limit`` records.
972 Returns
973 -------
974 result : `DimensionRecordQueryResults`
975 Returns ``self`` instance which is updated to return limited set
976 of records.
978 Notes
979 -----
980 This method can modify the iterator in place and return the same
981 instance. Normally this method is used together with `order_by`
982 method.
983 """
984 raise NotImplementedError()
986 @abstractmethod
987 def explain_no_results(self) -> Iterable[str]:
988 """Return human-readable messages that may help explain why the query
989 yields no results.
991 Returns
992 -------
993 messages : `Iterable` [ `str` ]
994 String messages that describe reasons the query might not yield any
995 results.
997 Notes
998 -----
999 Messages related to post-query filtering are only available if the
1000 iterator has been exhausted, or if `any` or `count` was already called
1001 (with ``exact=True`` for the latter two).
1003 This method first yields messages that are generated while the query is
1004 being built or filtered, but may then proceed to diagnostics generated
1005 by performing what should be inexpensive follow-up queries. Callers
1006 can short-circuit this at any time by simply not iterating further.
1007 """
1008 raise NotImplementedError()
class _DimensionRecordKey:
    """Comparison key used when sorting `DimensionRecord` instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Attribute names (possibly dotted) compared in priority order.
    ordering : `Sequence` [ `bool` ]
        Parallel flags, one per attribute: `True` means ascending order,
        `False` means descending.
    record : `DimensionRecord`
        The record wrapped by this key.
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Three-way comparison of two keys.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key wrapping the record to compare against.

        Returns
        -------
        result : `int`
            Zero when the keys compare equal, negative when ``self`` sorts
            before ``other``, positive otherwise.
        """
        for name, ascending in zip(self.attributes, self.ordering):
            # attrgetter handles dotted names such as timespan.begin/end,
            # which plain getattr cannot.
            getter = operator.attrgetter(name)
            left = getter(self.rec)
            right = getter(other.rec)
            if left == right:
                # Tie on this attribute; fall through to the next one.
                continue
            if ascending:
                return -1 if left < right else 1
            return -1 if left > right else 1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0
class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of DimensionRecordQueryResults using database query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for DataIds.
    recordStorage : `DimensionRecordStorage`
        Instance of storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        # Ordering expressions captured by order_by(); an empty tuple means
        # __iter__ performs no in-memory sort.
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at DataCoordinateQueryResults level
        # (assumption here is that if DataId exists then dimension record
        # exists too and their counts must be equal). fetch() does not
        # guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse list of column names and build a list of attribute names for
        # ordering. Note that here we only support ordering by direct
        # attributes of the element, and not other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: list[str] = []
        ordering: list[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                # A bare dimension in the ORDER BY clause sorts by its
                # primary key value.
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            # Comparison key honoring per-column ascending/descending flags.
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        # Record count equals data ID count by the assumption documented in
        # __iter__.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        # Forward ordering to the data ID query and remember the expressions
        # so __iter__ can sort fetched records in memory.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        # Signature modernized from Optional[int] to ``int | None`` for
        # consistency with the abstract method; behavior is unchanged.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()