Coverage for python/lsst/daf/butler/registry/queries/_results.py: 35%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
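
    For example, a sketch of typical use (the ``registry`` object and the
    dimension names here are hypothetical)::

        dataIds = registry.queryDataIds(["exposure", "detector"])
        for dataId in dataIds:
            ...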
100 """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit on the number of returned records and optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of dimension
            elements). The inner mapping has `tuple` keys representing data
            IDs (tuple conversions of `DataCoordinate.values()`) and
            `DimensionRecord` values.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            # The pre-existing query is returned as-is; this factory ignores
            # its ``order_by`` and ``limit`` arguments.
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
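
        For example, a sketch of a typical call (the ``registry`` object,
        dataset type name, and collection name here are hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            for ref in dataIds.findDatasets("calexp", collections=["HSC/runs/example"]):
                ...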
398 """
399 if not isinstance(datasetType, DatasetType):
400 datasetType = self._query.managers.datasets[datasetType].datasetType
401 # moving component handling down into managers.
402 if not datasetType.dimensions.issubset(self.graph):
403 raise ValueError(
404 f"findDatasets requires that the dataset type have the same dimensions as "
405 f"the DataCoordinateQueryResult used as input to the search, but "
406 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
407 f"dimensions are {self.graph}."
408 )
409 if datasetType.isComponent():
410 # We were given a true DatasetType instance, but it's a component.
411 parentName, componentName = datasetType.nameAndComponent()
412 storage = self._query.managers.datasets[parentName]
413 datasetType = storage.datasetType
414 components = [componentName]
415 else:
416 components = [None]
417 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
418 builder = self._query.makeBuilder(summary)
419 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
420 query = builder.finish(joinMissing=False)
421 return ParentDatasetQueryResults(
422 db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
423 )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
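
        For example, a sketch of typical diagnostic use (``results`` stands
        for any instance of this class)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)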
497 """
498 return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column names
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object whose iteration yields data IDs in the requested
            order.

        Notes
        -----
        This method returns a new instance with the ordering applied, leaving
        ``self`` unchanged; the returned object supports method chaining.
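
        For example, a sketch of a chained call (the ``registry`` object and
        the dimension name are hypothetical)::

            for dataId in registry.queryDataIds(["visit"]).order_by("-visit"):
                ...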
519 """
520 return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object that yields at most ``limit`` records.

        Notes
        -----
        This method returns a new instance with the limit applied, leaving
        ``self`` unchanged. Normally it is used together with `order_by`.
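
        For example, a sketch combining `order_by` and `limit` (the
        ``registry`` object and dimension name are hypothetical)::

            # Skip the first 20 ordered data IDs, then yield at most 10.
            for dataId in registry.queryDataIds(["visit"]).order_by("visit").limit(10, 20):
                ...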
544 """
545 return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
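
        For example, a sketch of iterating over per-parent groups (the
        ``registry`` call and its arguments are hypothetical)::

            results = registry.queryDatasets(..., collections=...)
            for parentResults in results.byParentDatasetType():
                print(parentResults.parentDatasetType.name)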
563 """
564 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
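
        For example, a sketch (the ``registry`` object and dataset type name
        are hypothetical)::

            refs = registry.queryDatasets("calexp", collections=...).expanded()
            for ref in refs:
                assert ref.dataId.hasRecords()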
598 """
599 raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results. ``query.datasetType``
        will be the parent dataset type for this object, and may not be
        `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` as the keys of the inner mappings.
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert (
            datasetType.dimensions == query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column names
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns ``self``, updated to return ordered results.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
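
        For example, a sketch of a chained call (the ``registry`` object and
        the dimension name are hypothetical)::

            for record in registry.queryDimensionRecords("detector").order_by("-detector"):
                ...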
959 """
960 raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns ``self``, updated to return a limited set of records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. Normally this method is used together with the `order_by`
        method.
        """
        raise NotImplementedError()


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` using a database query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for data IDs.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a DataId exists then its dimension
        # record exists too, so their counts must be equal). We still need to
        # make sure that ordering is applied to dimension records as well.
        if not self._order_by:
            return iter(self._recordStorage.fetch(self._dataIds))
        else:
            # The fetch() method does not support ordering; for now do it the
            # hard way, by fetching everything into memory and ordering by
            # DataId.
            dataIds = self._dataIds.toSequence()
            rec_map = {}
            for rec in self._recordStorage.fetch(dataIds):
                rec_map[rec.dataId] = rec
            # TODO: Do we want to clean up dataIds that may be missing?
            return iter(rec_map[dataId] for dataId in dataIds)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self