# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager, ExitStack, contextmanager
from typing import Any, Optional

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.

The factory is called with the ``order_by`` column names and the
``(limit, offset)`` tuple (either argument may be `None`) and returns the
`Query` that backs the results.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does nothing; the query is not
    executed until it is iterated over (or some other operation is performed
    that involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
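
    For example, iteration is what triggers execution (a sketch; ``registry``
    and the dimension names are assumed for illustration)::

        results = registry.queryDataIds(["visit", "detector"])
        for dataId in results:  # the database query runs here
            ...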
92 """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Query | None = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of
            dimension elements). The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A new results object backed by the given ``query``.
        """

        # The pre-built query is used as-is, so the order_by and limit
        # arguments passed to the factory are ignored.
        def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: QueryFactoryMethod | None = None,
        query: Query | None = None,
        graph: DimensionGraph | None = None,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
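
        For instance, a sketch of materializing once and reusing the table
        (``registry`` and the dimension graphs are assumed)::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                # Both follow-up queries run against the temporary table
                # rather than re-executing the original query.
                subset1 = tempDataIds.subset(graph=dimensions1, unique=True)
                subset2 = tempDataIds.subset(graph=dimensions2, unique=True)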
234 """
235 with self._query.materialize(self._db) as materialized:
236 # Note that we depend on order_by columns to be passes from Query
237 # to MaterializedQuery, so order_by and limit are not used.
238 yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.backend.managers.dimensions[element].fetch(subset)
                }
            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: DimensionGraph | None = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)
        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
        components: bool | None = None,
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well. If `False`, never apply patterns to
            components. If `None` (default), apply patterns to components
            only if their parent datasets were not matched by the
            expression. Fully-specified component datasets (`str` or
            `DatasetType` instances) are always included.

            Values other than `False` are deprecated, and only `False` will
            be supported after v26. After v27 this argument will be removed
            entirely.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have
            records as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
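
        For example (a sketch; the dataset type and collection names are
        assumed for illustration)::

            dataIds = registry.queryDataIds(["exposure", "detector"])
            for ref in dataIds.findDatasets("raw", collections=["raw/all"]):
                ...  # each ``ref`` is a `DatasetRef` for a ``raw`` dataset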
414 """
415 parent_dataset_type, components_found = self._query.backend.resolve_single_dataset_type_wildcard(
416 datasetType, components=components, explicit_only=True
417 )
418 if not parent_dataset_type.dimensions.issubset(self.graph):
419 raise ValueError(
420 f"findDatasets requires that the dataset type have only dimensions in "
421 f"the DataCoordinateQueryResult used as input to the search, but "
422 f"{parent_dataset_type.name} has dimensions {parent_dataset_type.dimensions}, "
423 f"while the input dimensions are {self.graph}."
424 )
425 summary = QuerySummary(
426 self.graph, whereRegion=self._query.whereRegion, datasets=[parent_dataset_type]
427 )
428 builder = self._query.makeBuilder(summary)
429 builder.joinDataset(parent_dataset_type, collections=collections, findFirst=findFirst)
430 query = builder.finish(joinMissing=False)
431 return ParentDatasetQueryResults(
432 db=self._db,
433 query=query,
434 components=components_found,
435 records=self._records,
436 datasetType=parent_dataset_type,
437 )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column
            names can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose iteration is ordered; ``self`` is not
            modified.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self``, which supports method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose iteration returns a limited set of
            records; ``self`` is not modified.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self``, which supports method chaining. It is normally used
        together with the `order_by` method.
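
        For example (a sketch; ``registry`` and the ordering column are
        assumed)::

            results = registry.queryDataIds(["visit"])
            for dataId in results.order_by("-visit").limit(5):
                ...  # the five highest visit IDs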
558 """
559 return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
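
        For example (a sketch; ``results`` is assumed to come from a query
        such as ``registry.queryDatasets(...)``)::

            for parent_results in results.byParentDatasetType():
                print(parent_results.parentDatasetType.name)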
577 """
578 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``. If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for all data IDs in
        returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it
            is already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys; each inner mapping has
        ``tuple(record.dataId.values())`` keys and `DimensionRecord`
        instances as values (where ``record`` is the innermost
        `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[str | None],
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
        datasetType: DatasetType | None = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets
        but different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
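
    For example (a sketch; ``registry`` and the element name are assumed),
    implementations are typically obtained from `Registry` query methods::

        records = registry.queryDimensionRecords("detector")
        for record in records.order_by("detector").limit(10):
            ...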
902 """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column
            names can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns the ``self`` instance, updated to return ordered
            results.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns the ``self`` instance, updated to return a limited set
            of records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. It is normally used together with the `order_by` method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when sorting `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending
        ordering, `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
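
    For example, a hypothetical use with `sorted` (``records`` is an
    iterable of `DimensionRecord` objects with a ``day_obs`` attribute,
    assumed for illustration)::

        ordered = sorted(
            records,
            key=lambda r: _DimensionRecordKey(["day_obs"], [True], r),
        )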
1039 """
1041 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
1042 self.attributes = attributes
1043 self.ordering = ordering
1044 self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare this key to another key, attribute by attribute.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys are identical, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Use operator.attrgetter because attributes like
            # "timespan.begin" are dotted paths that plain getattr cannot
            # resolve.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                # Descending order: swap the operands before comparing.
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Results object for the data IDs whose records are to be fetched.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its dimension
        # record exists too, so their counts must be equal). fetch() does
        # not guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering. Note that here we only support ordering by direct
        # attributes of the element, and not by other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: list[str] = []
        ordering: list[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()