Coverage for python/lsst/daf/butler/registry/queries/_results.py: 34%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
67"""Type of a query factory method type used by DataCoordinateQueryResults.
68"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does not execute the query; the
    query is not executed until the instance is iterated over (or some other
    operation is performed that involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`.  The outer mapping has `str` keys (the names of
            dimension elements).  The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)"""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)
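
    # A minimal usage sketch (``registry`` is a hypothetical `Registry`
    # instance; ``...`` stands in for real query arguments):
    #
    #     with registry.queryDataIds(...).materialize() as tempDataIds:
    #         # Both follow-up operations run against the temporary table
    #         # instead of re-executing the original query.
    #         n = tempDataIds.count()
    #         expanded = tempDataIds.expanded()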

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to search all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: move component handling down into managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have the same dimensions as "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )
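
    # A minimal usage sketch (``registry`` is a hypothetical `Registry`; the
    # dataset type and collection names are placeholders):
    #
    #     dataIds = registry.queryDataIds(["visit", "detector"], instrument="X")
    #     refs = dataIds.findDatasets("raw", collections=["X/raw/all"])
    #     for ref in refs:
    #         print(ref.datasetType.name, ref.dataId)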

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)
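
    # A minimal sketch of the exact/inexact trade-off (``dataIds`` is a
    # hypothetical results object from `Registry.queryDataIds`):
    #
    #     if dataIds.any(execute=True, exact=False):
    #         # Cheap check: may be a false positive if post-query filtering
    #         # would remove every row.
    #         upper_bound = dataIds.count(exact=False)
    #         true_count = dataIds.count(exact=True)  # runs the full query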

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        return self._query.explain_no_results(self._db)
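
    # A minimal diagnostic sketch (``dataIds`` is a hypothetical results
    # object that turned out to be empty):
    #
    #     if not dataIds.any():
    #         for message in dataIds.explain_no_results():
    #             print(message)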

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  Column
            names can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object updated to return ordered results.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, and supports method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object updated to return a limited set of records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, and supports method chaining.  Normally it is used
        together with `order_by`.
        """
        return self._clone(limit=(limit, offset))
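
    # A minimal chaining sketch (``registry`` is a hypothetical `Registry`):
    #
    #     dataIds = (
    #         registry.queryDataIds(["visit"], instrument="X")
    #         .order_by("-visit")
    #         .limit(10, offset=20)
    #     )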


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for all data IDs in
        returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()
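
    # A minimal grouping sketch (``refs`` is a hypothetical
    # `DatasetQueryResults` from `Registry.queryDatasets`):
    #
    #     for parentResults in refs.byParentDatasetType():
    #         print(parentResults.parentDatasetType.name,
    #               parentResults.count(exact=False))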


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, ``tuple(record.dataId.values())`` as inner keys, and
        `DimensionRecord` instances as inner values (where ``record`` is the
        innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert (
            datasetType.dimensions == query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )
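
    # A minimal component sketch (``results`` is a hypothetical
    # `ParentDatasetQueryResults`; the component name is a placeholder):
    #
    #     # Iterate over the parent dataset and one of its components.
    #     for ref in results.withComponents([None, "wcs"]):
    #         ...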

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by
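
    # A minimal chaining sketch (``resultsA`` and ``resultsB`` are
    # hypothetical `ParentDatasetQueryResults` instances for two different
    # parent dataset types):
    #
    #     chained = ChainedDatasetQueryResults([resultsA, resultsB])
    #     total = chained.count(exact=False)  # sum over both children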


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  Column
            names can be prefixed with minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            The ``self`` instance, updated to return ordered results.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            The ``self`` instance, updated to return a limited set of
            records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.  Normally this method is used together with the `order_by`
        method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys in ordering `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending ordering,
        `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare this key to another key.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys are identical, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Attribute names like "timespan.begin" contain dots, so plain
            # getattr cannot be used; operator.attrgetter handles them.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0
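
# A minimal sorting sketch (``records`` is a hypothetical iterable of
# `DimensionRecord` instances that happen to have ``day_obs`` and ``id``
# attributes):
#
#     attributes = ["day_obs", "id"]
#     ordering = [True, False]  # ascending day_obs, descending id
#     ordered = sorted(
#         records,
#         key=lambda r: _DimensionRecordKey(attributes, ordering, r),
#     )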


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterable of data IDs.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its dimension
        # record exists too, so their counts must be equal).  fetch() does
        # not guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering.  Note that here we only support ordering by direct
        # attributes of the element, and not by other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: List[str] = []
        ordering: List[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()
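
# A minimal end-to-end sketch (``registry`` is a hypothetical `Registry`;
# `Registry.queryDimensionRecords` is the usual public entry point that
# constructs results objects like the one above, and the element, ordering
# column, and data ID constraint below are placeholders):
#
#     records = registry.queryDimensionRecords("detector", instrument="X")
#     for record in records.order_by("id").limit(5):
#         print(record)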