Coverage for python/lsst/daf/butler/registry/queries/_results.py: 32%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DatabaseDimensionRecordQueryResults",
26 "DataCoordinateQueryResults",
27 "DatasetQueryResults",
28 "DimensionRecordQueryResults",
29 "ParentDatasetQueryResults",
30)
32from abc import abstractmethod
33from contextlib import contextmanager, ExitStack
34import itertools
35from typing import (
36 Any,
37 Callable,
38 ContextManager,
39 Iterable,
40 Iterator,
41 Mapping,
42 Optional,
43 Sequence,
44 Tuple,
45 Union,
46)
48import sqlalchemy
50from ...core import (
51 DataCoordinate,
52 DataCoordinateIterable,
53 DatasetRef,
54 DatasetType,
55 DimensionGraph,
56 DimensionRecord,
57 SimpleQuery,
58)
59from ..interfaces import Database, DimensionRecordStorage
60from ._query import Query
61from ._structs import QuerySummary
64QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
65"""Type of a query factory method type used by DataCoordinateQueryResults.
66"""
69class DataCoordinateQueryResults(DataCoordinateIterable):
70 """An enhanced implementation of `DataCoordinateIterable` that represents
71 data IDs retrieved from a database query.
73 Parameters
74 ----------
75 db : `Database`
76 Database engine used to execute queries.
77 query_factory : `QueryFactoryMethod`
78 Method which creates an instance of `Query` class.
79 graph : `DimensionGraph`
80 Dimensions used by query.
81 order_by : `Iterable` [ `str` ], optional
82 Optional sequence of column names used for result ordering.
83 limit : `Tuple` [ `int`, `int` ], optional
84 Limit for the number of returned records and optional offset.
85 records : `Mapping`, optional
86 A nested mapping containing `DimensionRecord` objects for all
87 dimensions and all data IDs this query will yield. If `None`
88 (default), `DataCoordinateIterable.hasRecords` will return `False`.
89 The outer mapping has `str` keys (the names of dimension elements).
90 The inner mapping has `tuple` keys representing data IDs (tuple
91 conversions of `DataCoordinate.values()`) and `DimensionRecord` values.
93 Notes
94 -----
95 Constructing an instance of this does nothing; the query is not executed
96 until it is iterated over (or some other operation is performed that
97 involves iteration).
99 Instances should generally only be constructed by `Registry` methods or the
100 methods of other query result objects.
101 """
102 def __init__(self,
103 db: Database,
104 query_factory: QueryFactoryMethod,
105 graph: DimensionGraph,
106 *,
107 order_by: Optional[Iterable[str]] = None,
108 limit: Optional[Tuple[int, Optional[int]]] = None,
109 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
110 self._db = db
111 self._query_factory = query_factory
112 self._graph = graph
113 self._order_by = order_by
114 self._limit = limit
115 self._records = records
116 self._cached_query: Optional[Query] = None
118 __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")
120 @classmethod
121 def from_query(cls,
122 db: Database,
123 query: Query,
124 graph: DimensionGraph,
125 *,
126 order_by: Optional[Iterable[str]] = None,
127 limit: Optional[Tuple[int, Optional[int]]] = None,
128 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None
129 ) -> DataCoordinateQueryResults:
130 """Make an instance from a pre-existing query instead of a factory.
132 Parameters
133 ----------
134 db : `Database`
135 Database engine used to execute queries.
136 query : `Query`
137 Low-level representation of the query that backs this result
138 object.
139 graph : `DimensionGraph`
140 Dimensions used by query.
141 order_by : `Iterable` [ `str` ], optional
142 Optional sequence of column names used for result ordering.
143 limit : `Tuple` [ `int`, `int` ], optional
144 Limit for the number of returned records and optional offset.
145 records : `Mapping`, optional
146 A nested mapping containing `DimensionRecord` objects for all
147 dimensions and all data IDs this query will yield. If `None`
148 (default), `DataCoordinateIterable.hasRecords` will return `False`.
149 The outer mapping has `str` keys (the names of dimension elements).
150 The inner mapping has `tuple` keys representing data IDs (tuple
151 conversions of `DataCoordinate.values()`) and `DimensionRecord`
152 values.
153 """
154 def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
155 return query
157 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by,
158 limit=limit, records=records)
160 def __iter__(self) -> Iterator[DataCoordinate]:
161 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))
163 def __repr__(self) -> str:
164 return f"<DataCoordinate iterator with dimensions={self._graph}>"
166 def _clone(self, *,
167 query_factory: Optional[QueryFactoryMethod] = None,
168 query: Optional[Query] = None,
169 graph: Optional[DimensionGraph] = None,
170 order_by: Optional[Iterable[str]] = None,
171 limit: Optional[Tuple[int, Optional[int]]] = None,
172 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
173 ) -> DataCoordinateQueryResults:
174 """Clone this instance potentially updating some attributes.
175 """
176 graph = graph if graph is not None else self._graph
177 order_by = order_by if order_by is not None else self._order_by
178 limit = limit if limit is not None else self._limit
179 records = records if records is not None else self._records
180 if query is None:
181 query_factory = query_factory or self._query_factory
182 return DataCoordinateQueryResults(self._db, query_factory, graph,
183 order_by=order_by, limit=limit, records=records)
184 else:
185 return DataCoordinateQueryResults.from_query(self._db, query, graph,
186 order_by=order_by, limit=limit, records=records)
188 @property
189 def _query(self) -> Query:
190 """Query representation instance (`Query`)"""
191 if self._cached_query is None:
192 self._cached_query = self._query_factory(self._order_by, self._limit)
193 assert self._cached_query.datasetType is None, \
194 "Query used to initialize data coordinate results should not have any datasets."
195 return self._cached_query
197 @property
198 def graph(self) -> DimensionGraph:
199 # Docstring inherited from DataCoordinateIterable.
200 return self._graph
202 def hasFull(self) -> bool:
203 # Docstring inherited from DataCoordinateIterable.
204 return True
206 def hasRecords(self) -> bool:
207 # Docstring inherited from DataCoordinateIterable.
208 return self._records is not None or not self._graph
210 @contextmanager
211 def materialize(self) -> Iterator[DataCoordinateQueryResults]:
212 """Insert this query's results into a temporary table.
214 Returns
215 -------
216 context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
217 A context manager that ensures the temporary table is created and
218 populated in ``__enter__`` (returning a results object backed by
219 that table), and dropped in ``__exit__``. If ``self`` is already
220 materialized, the context manager may do nothing (reflecting the
221 fact that an outer context manager should already take care of
222 everything else).
224 Notes
225 -----
226 When using a very large result set to perform multiple queries (e.g.
227 multiple calls to `subset` with different arguments, or even a single
228 call to `expanded`), it may be much more efficient to start by
229 materializing the query and only then performing the follow up queries.
230 It may also be less efficient, depending on how well database engine's
231 query optimizer can simplify those particular follow-up queries and
232 how efficiently it caches query results even when the are not
233 explicitly inserted into a temporary table. See `expanded` and
234 `subset` for examples.
235 """
236 with self._query.materialize(self._db) as materialized:
237 # Note that we depend on order_by columns to be passes from Query
238 # to MaterializedQuery, so order_by and limit are not used.
239 yield self._clone(query=materialized)
241 def expanded(self) -> DataCoordinateQueryResults:
242 """Return a results object for which `hasRecords` returns `True`.
244 This method may involve actually executing database queries to fetch
245 `DimensionRecord` objects.
247 Returns
248 -------
249 results : `DataCoordinateQueryResults`
250 A results object for which `hasRecords` returns `True`. May be
251 ``self`` if that is already the case.
253 Notes
254 -----
255 For very result sets, it may be much more efficient to call
256 `materialize` before calling `expanded`, to avoid performing the
257 original query multiple times (as a subquery) in the follow-up queries
258 that fetch dimension records. For example::
260 with registry.queryDataIds(...).materialize() as tempDataIds:
261 dataIdsWithRecords = tempDataIds.expanded()
262 for dataId in dataIdsWithRecords:
263 ...
264 """
265 if self._records is None:
266 records = {}
267 for element in self.graph.elements:
268 subset = self.subset(graph=element.graph, unique=True)
269 records[element.name] = {
270 tuple(record.dataId.values()): record
271 for record in self._query.managers.dimensions[element].fetch(subset)
272 }
274 return self._clone(query=self._query, records=records)
275 else:
276 return self
278 def subset(self, graph: Optional[DimensionGraph] = None, *,
279 unique: bool = False) -> DataCoordinateQueryResults:
280 """Return a results object containing a subset of the dimensions of
281 this one, and/or a unique near-subset of its rows.
283 This method may involve actually executing database queries to fetch
284 `DimensionRecord` objects.
286 Parameters
287 ----------
288 graph : `DimensionGraph`, optional
289 Dimensions to include in the new results object. If `None`,
290 ``self.graph`` is used.
291 unique : `bool`, optional
292 If `True` (`False` is default), the query should only return unique
293 data IDs. This is implemented in the database; to obtain unique
294 results via Python-side processing (which may be more efficient in
295 some cases), use `toSet` to construct a `DataCoordinateSet` from
296 this results object instead.
298 Returns
299 -------
300 results : `DataCoordinateQueryResults`
301 A results object corresponding to the given criteria. May be
302 ``self`` if it already qualifies.
304 Notes
305 -----
306 This method can only return a "near-subset" of the original result rows
307 in general because of subtleties in how spatial overlaps are
308 implemented; see `Query.subset` for more information.
310 When calling `subset` multiple times on the same very large result set,
311 it may be much more efficient to call `materialize` first. For
312 example::
314 dimensions1 = DimensionGraph(...)
315 dimensions2 = DimensionGraph(...)
316 with registry.queryDataIds(...).materialize() as tempDataIds:
317 for dataId1 in tempDataIds.subset(
318 graph=dimensions1,
319 unique=True):
320 ...
321 for dataId2 in tempDataIds.subset(
322 graph=dimensions2,
323 unique=True):
324 ...
325 """
326 if graph is None:
327 graph = self.graph
328 if not graph.issubset(self.graph):
329 raise ValueError(f"{graph} is not a subset of {self.graph}")
330 if graph == self.graph and (not unique or self._query.isUnique()):
331 return self
332 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
333 if self._records is not None:
334 records = {element.name: self._records[element.name] for element in graph.elements}
335 else:
336 records = None
337 query = self._query.subset(graph=graph, datasets=False, unique=unique)
339 return self._clone(graph=graph, query=query, records=records)
341 def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
342 # Docstring inherited from DataCoordinateIterable.
343 sql = self._query.sql
344 if sql is not None:
345 fromClause = sql.alias("c")
346 query.join(
347 fromClause,
348 onclause=sqlalchemy.sql.and_(*[
349 columns(dimension.name) == fromClause.columns[dimension.name]
350 for dimension in self.graph.required
351 ])
352 )
354 def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
355 findFirst: bool = True) -> ParentDatasetQueryResults:
356 """Find datasets using the data IDs identified by this query.
358 Parameters
359 ----------
360 datasetType : `DatasetType` or `str`
361 Dataset type or the name of one to search for. Must have
362 dimensions that are a subset of ``self.graph``.
363 collections : `Any`
364 An expression that fully or partially identifies the collections
365 to search for the dataset, such as a `str`, `re.Pattern`, or
366 iterable thereof. ``...`` can be used to return all collections.
367 See :ref:`daf_butler_collection_expressions` for more information.
368 findFirst : `bool`, optional
369 If `True` (default), for each result data ID, only yield one
370 `DatasetRef`, from the first collection in which a dataset of that
371 dataset type appears (according to the order of ``collections``
372 passed in). If `True`, ``collections`` must not contain regular
373 expressions and may not be ``...``.
375 Returns
376 -------
377 datasets : `ParentDatasetQueryResults`
378 A lazy-evaluation object representing dataset query results,
379 iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
380 nested data IDs in those dataset references will have records as
381 well.
383 Raises
384 ------
385 ValueError
386 Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
387 """
388 if not isinstance(datasetType, DatasetType):
389 datasetType = self._query.managers.datasets[datasetType].datasetType
390 # moving component handling down into managers.
391 if not datasetType.dimensions.issubset(self.graph):
392 raise ValueError(f"findDatasets requires that the dataset type have the same dimensions as "
393 f"the DataCoordinateQueryResult used as input to the search, but "
394 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
395 f"dimensions are {self.graph}.")
396 if datasetType.isComponent():
397 # We were given a true DatasetType instance, but it's a component.
398 parentName, componentName = datasetType.nameAndComponent()
399 storage = self._query.managers.datasets[parentName]
400 datasetType = storage.datasetType
401 components = [componentName]
402 else:
403 components = [None]
404 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
405 builder = self._query.makeBuilder(summary)
406 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
407 query = builder.finish(joinMissing=False)
408 return ParentDatasetQueryResults(db=self._db, query=query, components=components,
409 records=self._records, datasetType=datasetType)
411 def count(self, *, exact: bool = True) -> int:
412 """Count the number of rows this query would return.
414 Parameters
415 ----------
416 exact : `bool`, optional
417 If `True`, run the full query and perform post-query filtering if
418 needed to account for that filtering in the count. If `False`, the
419 result may be an upper bound.
421 Returns
422 -------
423 count : `int`
424 The number of rows the query would return, or an upper bound if
425 ``exact=False``.
427 Notes
428 -----
429 This counts the number of rows returned, not the number of unique rows
430 returned, so even with ``exact=True`` it may provide only an upper
431 bound on the number of *deduplicated* result rows.
432 """
433 return self._query.count(self._db, exact=exact)
435 def any(
436 self, *,
437 execute: bool = True,
438 exact: bool = True,
439 ) -> bool:
440 """Test whether this query returns any results.
442 Parameters
443 ----------
444 execute : `bool`, optional
445 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
446 determined prior to execution that the query would return no rows.
447 exact : `bool`, optional
448 If `True`, run the full query and perform post-query filtering if
449 needed, until at least one result row is found. If `False`, the
450 returned result does not account for post-query filtering, and
451 hence may be `True` even when all result rows would be filtered
452 out.
454 Returns
455 -------
456 any : `bool`
457 `True` if the query would (or might, depending on arguments) yield
458 result rows. `False` if it definitely would not.
459 """
460 return self._query.any(self._db, execute=execute, exact=exact)
462 def explain_no_results(self) -> Iterator[str]:
463 """Return human-readable messages that may help explain why the query
464 yields no results.
466 Returns
467 -------
468 messages : `Iterator` [ `str` ]
469 String messages that describe reasons the query might not yield any
470 results.
472 Notes
473 -----
474 Messages related to post-query filtering are only available if the
475 iterator has been exhausted, or if `any` or `count` was already called
476 (with ``exact=True`` for the latter two).
478 This method first yields messages that are generated while the query is
479 being built or filtered, but may then proceed to diagnostics generated
480 by performing what should be inexpensive follow-up queries. Callers
481 can short-circuit this at any time by simplying not iterating further.
482 """
483 return self._query.explain_no_results(self._db)
485 def order_by(self, *args: str) -> DataCoordinateQueryResults:
486 """Make the iterator return ordered result.
488 Parameters
489 ----------
490 *args : `str`
491 Names of the columns/dimensions to use for ordering. Column name
492 can be prefixed with minus (``-``) to use descending ordering.
494 Returns
495 -------
496 result : `DataCoordinateQueryResults`
497 Returns ``self`` instance which is updated to return ordered
498 result.
500 Notes
501 -----
502 This method modifies the iterator in place and returns the same
503 instance to support method chaining.
504 """
505 return self._clone(order_by=args)
507 def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
508 """Make the iterator return limited number of records.
510 Parameters
511 ----------
512 limit : `int`
513 Upper limit on the number of returned records.
514 offset : `int` or `None`
515 If not `None` then the number of records to skip before returning
516 ``limit`` records.
518 Returns
519 -------
520 result : `DataCoordinateQueryResults`
521 Returns ``self`` instance which is updated to return limited set
522 of records.
524 Notes
525 -----
526 This method modifies the iterator in place and returns the same
527 instance to support method chaining. Normally this method is used
528 together with `order_by` method.
529 """
530 return self._clone(limit=(limit, offset))
533class DatasetQueryResults(Iterable[DatasetRef]):
534 """An interface for objects that represent the results of queries for
535 datasets.
536 """
538 @abstractmethod
539 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
540 """Group results by parent dataset type.
542 Returns
543 -------
544 iter : `Iterator` [ `ParentDatasetQueryResults` ]
545 An iterator over `DatasetQueryResults` instances that are each
546 responsible for a single parent dataset type (either just that
547 dataset type, one or more of its component dataset types, or both).
548 """
549 raise NotImplementedError()
551 @abstractmethod
552 def materialize(self) -> ContextManager[DatasetQueryResults]:
553 """Insert this query's results into a temporary table.
555 Returns
556 -------
557 context : `typing.ContextManager` [ `DatasetQueryResults` ]
558 A context manager that ensures the temporary table is created and
559 populated in ``__enter__`` (returning a results object backed by
560 that table), and dropped in ``__exit__``. If ``self`` is already
561 materialized, the context manager may do nothing (reflecting the
562 fact that an outer context manager should already take care of
563 everything else).
564 """
565 raise NotImplementedError()
567 @abstractmethod
568 def expanded(self) -> DatasetQueryResults:
569 """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
570 returns `True` for all data IDs in returned `DatasetRef` objects.
572 Returns
573 -------
574 expanded : `DatasetQueryResults`
575 Either a new `DatasetQueryResults` instance or ``self``, if it is
576 already expanded.
578 Notes
579 -----
580 As with `DataCoordinateQueryResults.expanded`, it may be more efficient
581 to call `materialize` before expanding data IDs for very large result
582 sets.
583 """
584 raise NotImplementedError()
586 @abstractmethod
587 def count(self, *, exact: bool = True) -> int:
588 """Count the number of rows this query would return.
590 Parameters
591 ----------
592 exact : `bool`, optional
593 If `True`, run the full query and perform post-query filtering if
594 needed to account for that filtering in the count. If `False`, the
595 result may be an upper bound.
597 Returns
598 -------
599 count : `int`
600 The number of rows the query would return, or an upper bound if
601 ``exact=False``.
603 Notes
604 -----
605 This counts the number of rows returned, not the number of unique rows
606 returned, so even with ``exact=True`` it may provide only an upper
607 bound on the number of *deduplicated* result rows.
608 """
609 raise NotImplementedError()
611 @abstractmethod
612 def any(
613 self, *,
614 execute: bool = True,
615 exact: bool = True,
616 ) -> bool:
617 """Test whether this query returns any results.
619 Parameters
620 ----------
621 execute : `bool`, optional
622 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
623 determined prior to execution that the query would return no rows.
624 exact : `bool`, optional
625 If `True`, run the full query and perform post-query filtering if
626 needed, until at least one result row is found. If `False`, the
627 returned result does not account for post-query filtering, and
628 hence may be `True` even when all result rows would be filtered
629 out.
631 Returns
632 -------
633 any : `bool`
634 `True` if the query would (or might, depending on arguments) yield
635 result rows. `False` if it definitely would not.
636 """
637 raise NotImplementedError()
639 @abstractmethod
640 def explain_no_results(self) -> Iterator[str]:
641 """Return human-readable messages that may help explain why the query
642 yields no results.
644 Returns
645 -------
646 messages : `Iterator` [ `str` ]
647 String messages that describe reasons the query might not yield any
648 results.
650 Notes
651 -----
652 Messages related to post-query filtering are only available if the
653 iterator has been exhausted, or if `any` or `count` was already called
654 (with ``exact=True`` for the latter two).
656 This method first yields messages that are generated while the query is
657 being built or filtered, but may then proceed to diagnostics generated
658 by performing what should be inexpensive follow-up queries. Callers
659 can short-circuit this at any time by simplying not iterating further.
660 """
661 raise NotImplementedError()
664class ParentDatasetQueryResults(DatasetQueryResults):
665 """An object that represents results from a query for datasets with a
666 single parent `DatasetType`.
668 Parameters
669 ----------
670 db : `Database`
671 Database engine to execute queries against.
672 query : `Query`
673 Low-level query object that backs these results. ``query.datasetType``
674 will be the parent dataset type for this object, and may not be `None`.
675 components : `Sequence` [ `str` or `None` ]
676 Names of components to include in iteration. `None` may be included
677 (at most once) to include the parent dataset type.
678 records : `Mapping`, optional
679 Mapping containing `DimensionRecord` objects for all dimensions and
680 all data IDs this query will yield. If `None` (default),
681 `DataCoordinate.hasRecords` will return `False` for all nested data
682 IDs. This is a nested mapping with `str` names of dimension elements
683 as outer keys, `DimensionRecord` instances as inner values, and
684 ``tuple(record.dataId.values())`` for the inner keys / outer values
685 (where ``record`` is the innermost `DimensionRecord` instance).
686 datasetType : `DatasetType`, optional
687 Parent dataset type for all datasets returned by this query. If not
688 provided, ``query.datasetType`` be used, and must not be `None` (as it
689 is in the case where the query is known to yield no results prior to
690 execution).
691 """
692 def __init__(self, db: Database, query: Query, *,
693 components: Sequence[Optional[str]],
694 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
695 datasetType: Optional[DatasetType] = None):
696 self._db = db
697 self._query = query
698 self._components = components
699 self._records = records
700 if datasetType is None:
701 datasetType = query.datasetType
702 assert datasetType is not None, \
703 "Query used to initialize dataset results must have a dataset."
704 assert datasetType.dimensions == query.graph, \
705 f"Query dimensions {query.graph} do not match dataset type dimesions {datasetType.dimensions}."
706 self._datasetType = datasetType
708 __slots__ = ("_db", "_query", "_dimensions", "_components", "_records")
710 def __iter__(self) -> Iterator[DatasetRef]:
711 for row in self._query.rows(self._db):
712 parentRef = self._query.extractDatasetRef(row, records=self._records)
713 for component in self._components:
714 if component is None:
715 yield parentRef
716 else:
717 yield parentRef.makeComponentRef(component)
719 def __repr__(self) -> str:
720 return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"
722 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
723 # Docstring inherited from DatasetQueryResults.
724 yield self
726 @contextmanager
727 def materialize(self) -> Iterator[ParentDatasetQueryResults]:
728 # Docstring inherited from DatasetQueryResults.
729 with self._query.materialize(self._db) as materialized:
730 yield ParentDatasetQueryResults(self._db, materialized,
731 components=self._components,
732 records=self._records)
734 @property
735 def parentDatasetType(self) -> DatasetType:
736 """The parent dataset type for all datasets in this iterable
737 (`DatasetType`).
738 """
739 return self._datasetType
741 @property
742 def dataIds(self) -> DataCoordinateQueryResults:
743 """A lazy-evaluation object representing a query for just the data
744 IDs of the datasets that would be returned by this query
745 (`DataCoordinateQueryResults`).
747 The returned object is not in general `zip`-iterable with ``self``;
748 it may be in a different order or have (or not have) duplicates.
749 """
750 query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
751 return DataCoordinateQueryResults.from_query(
752 self._db,
753 query,
754 self.parentDatasetType.dimensions,
755 records=self._records,
756 )
758 def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
759 """Return a new query results object for the same parent datasets but
760 different components.
762 components : `Sequence` [ `str` or `None` ]
763 Names of components to include in iteration. `None` may be
764 included (at most once) to include the parent dataset type.
765 """
766 return ParentDatasetQueryResults(self._db, self._query, records=self._records,
767 components=components, datasetType=self._datasetType)
769 def expanded(self) -> ParentDatasetQueryResults:
770 # Docstring inherited from DatasetQueryResults.
771 if self._records is None:
772 records = self.dataIds.expanded()._records
773 return ParentDatasetQueryResults(self._db, self._query, records=records,
774 components=self._components, datasetType=self._datasetType)
775 else:
776 return self
778 def count(self, *, exact: bool = True) -> int:
779 # Docstring inherited.
780 return len(self._components) * self._query.count(self._db, exact=exact)
782 def any(
783 self, *,
784 execute: bool = True,
785 exact: bool = True,
786 ) -> bool:
787 # Docstring inherited.
788 return self._query.any(self._db, execute=execute, exact=exact)
790 def explain_no_results(self) -> Iterator[str]:
791 # Docstring inherited.
792 return self._query.explain_no_results(self._db)
795class ChainedDatasetQueryResults(DatasetQueryResults):
796 """A `DatasetQueryResults` implementation that simply chains together
797 other results objects, each for a different parent dataset type.
799 Parameters
800 ----------
801 chain : `Sequence` [ `ParentDatasetQueryResults` ]
802 The underlying results objects this object will chain together.
803 doomed_by : `Iterable` [ `str` ], optional
804 A list of messages (appropriate for e.g. logging or exceptions) that
805 explain why the query is known to return no results even before it is
806 executed. Queries with a non-empty list will never be executed.
807 Child results objects may also have their own list.
808 """
810 def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
811 self._chain = chain
812 self._doomed_by = tuple(doomed_by)
814 __slots__ = ("_chain",)
816 def __iter__(self) -> Iterator[DatasetRef]:
817 return itertools.chain.from_iterable(self._chain)
819 def __repr__(self) -> str:
820 return "<DatasetRef iterator for multiple dataset types>"
822 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
823 # Docstring inherited from DatasetQueryResults.
824 return iter(self._chain)
826 @contextmanager
827 def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
828 # Docstring inherited from DatasetQueryResults.
829 with ExitStack() as stack:
830 yield ChainedDatasetQueryResults(
831 [stack.enter_context(r.materialize()) for r in self._chain]
832 )
834 def expanded(self) -> ChainedDatasetQueryResults:
835 # Docstring inherited from DatasetQueryResults.
836 return ChainedDatasetQueryResults([r.expanded() for r in self._chain])
838 def count(self, *, exact: bool = True) -> int:
839 # Docstring inherited.
840 return sum(r.count(exact=exact) for r in self._chain)
842 def any(
843 self, *,
844 execute: bool = True,
845 exact: bool = True,
846 ) -> bool:
847 # Docstring inherited.
848 return any(r.any(execute=execute, exact=exact) for r in self._chain)
850 def explain_no_results(self) -> Iterator[str]:
851 # Docstring inherited.
852 for r in self._chain:
853 yield from r.explain_no_results()
854 yield from self._doomed_by
857class DimensionRecordQueryResults(Iterable[DimensionRecord]):
858 """An interface for objects that represent the results of queries for
859 dimension records.
860 """
862 @abstractmethod
863 def count(self, *, exact: bool = True) -> int:
864 """Count the number of rows this query would return.
866 Parameters
867 ----------
868 exact : `bool`, optional
869 If `True`, run the full query and perform post-query filtering if
870 needed to account for that filtering in the count. If `False`, the
871 result may be an upper bound.
873 Returns
874 -------
875 count : `int`
876 The number of rows the query would return, or an upper bound if
877 ``exact=False``.
879 Notes
880 -----
881 This counts the number of rows returned, not the number of unique rows
882 returned, so even with ``exact=True`` it may provide only an upper
883 bound on the number of *deduplicated* result rows.
884 """
885 raise NotImplementedError()
887 @abstractmethod
888 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
889 """Test whether this query returns any results.
891 Parameters
892 ----------
893 execute : `bool`, optional
894 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
895 determined prior to execution that the query would return no rows.
896 exact : `bool`, optional
897 If `True`, run the full query and perform post-query filtering if
898 needed, until at least one result row is found. If `False`, the
899 returned result does not account for post-query filtering, and
900 hence may be `True` even when all result rows would be filtered
901 out.
903 Returns
904 -------
905 any : `bool`
906 `True` if the query would (or might, depending on arguments) yield
907 result rows. `False` if it definitely would not.
908 """
909 raise NotImplementedError()
911 @abstractmethod
912 def order_by(self, *args: str) -> DimensionRecordQueryResults:
913 """Make the iterator return ordered result.
915 Parameters
916 ----------
917 *args : `str`
918 Names of the columns/dimensions to use for ordering. Column name
919 can be prefixed with minus (``-``) to use descending ordering.
921 Returns
922 -------
923 result : `DimensionRecordQueryResults`
924 Returns ``self`` instance which is updated to return ordered
925 result.
927 Notes
928 -----
929 This method can modify the iterator in place and return the same
930 instance.
931 """
932 raise NotImplementedError()
934 @abstractmethod
935 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
936 """Make the iterator return limited number of records.
938 Parameters
939 ----------
940 limit : `int`
941 Upper limit on the number of returned records.
942 offset : `int` or `None`
943 If not `None` then the number of records to skip before returning
944 ``limit`` records.
946 Returns
947 -------
948 result : `DimensionRecordQueryResults`
949 Returns ``self`` instance which is updated to return limited set
950 of records.
952 Notes
953 -----
954 This method can modify the iterator in place and return the same
955 instance. Normally this method is used together with `order_by`
956 method.
957 """
958 raise NotImplementedError()
961class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
962 """Implementation of DimensionRecordQueryResults using database query.
964 Parameters
965 ----------
966 dataIds : `DataCoordinateQueryResults`
967 Iterator for DataIds.
968 recordStorage : `DimensionRecordStorage`
969 Instance of storage class for dimension records.
970 """
971 def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
972 self._dataIds = dataIds
973 self._recordStorage = recordStorage
974 self._order_by: Iterable[str] = ()
976 def __iter__(self) -> Iterator[DimensionRecord]:
977 # LIMIT is already applied at DataCoordinateQueryResults level
978 # (assumption here is that if DataId exists then dimension record
979 # exists too and their counts must be equal). We still need to make
980 # sure that ordering is applied to dimension records as well.
981 if not self._order_by:
982 return iter(self._recordStorage.fetch(self._dataIds))
983 else:
984 # fetch() method does not support ordering, for now do it hard way
985 # by fetching everything into memory and ordering by DataId
986 dataIds = self._dataIds.toSequence()
987 rec_map = {}
988 for rec in self._recordStorage.fetch(dataIds):
989 rec_map[rec.dataId] = rec
990 # TODO: Do we want to clean up dataIds that may be missing
991 return iter(rec_map[dataId] for dataId in dataIds)
993 def count(self, *, exact: bool = True) -> int:
994 # Docstring inherited from base class.
995 return self._dataIds.count(exact=exact)
997 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
998 # Docstring inherited from base class.
999 return self._dataIds.any(execute=execute, exact=exact)
1001 def order_by(self, *args: str) -> DimensionRecordQueryResults:
1002 # Docstring inherited from base class.
1003 self._dataIds = self._dataIds.order_by(*args)
1004 self._order_by = args
1005 return self
1007 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
1008 # Docstring inherited from base class.
1009 self._dataIds = self._dataIds.limit(limit, offset)
1010 return self