Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DatabaseDimensionRecordQueryResults",
26 "DataCoordinateQueryResults",
27 "DatasetQueryResults",
28 "DimensionRecordQueryResults",
29 "ParentDatasetQueryResults",
30)
32import itertools
33import operator
34from abc import abstractmethod
35from contextlib import ExitStack, contextmanager
36from typing import (
37 Any,
38 Callable,
39 ContextManager,
40 Iterable,
41 Iterator,
42 List,
43 Mapping,
44 Optional,
45 Sequence,
46 Tuple,
47 Union,
48)
50import sqlalchemy
52from ...core import (
53 DataCoordinate,
54 DataCoordinateIterable,
55 DatasetRef,
56 DatasetType,
57 Dimension,
58 DimensionGraph,
59 DimensionRecord,
60 SimpleQuery,
61)
62from ..interfaces import Database, DimensionRecordStorage
63from ._query import Query
64from ._structs import ElementOrderByClause, QuerySummary
66QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
67"""Type of a query factory method type used by DataCoordinateQueryResults.
68"""
71class DataCoordinateQueryResults(DataCoordinateIterable):
72 """An enhanced implementation of `DataCoordinateIterable` that represents
73 data IDs retrieved from a database query.
75 Parameters
76 ----------
77 db : `Database`
78 Database engine used to execute queries.
79 query_factory : `QueryFactoryMethod`
80 Method which creates an instance of `Query` class.
81 graph : `DimensionGraph`
82 Dimensions used by query.
83 order_by : `Iterable` [ `str` ], optional
84 Optional sequence of column names used for result ordering.
85 limit : `Tuple` [ `int`, `int` ], optional
86 Limit for the number of returned records and optional offset.
87 records : `Mapping`, optional
88 A nested mapping containing `DimensionRecord` objects for all
89 dimensions and all data IDs this query will yield. If `None`
90 (default), `DataCoordinateIterable.hasRecords` will return `False`.
91 The outer mapping has `str` keys (the names of dimension elements).
92 The inner mapping has `tuple` keys representing data IDs (tuple
93 conversions of `DataCoordinate.values()`) and `DimensionRecord` values.
95 Notes
96 -----
97 Constructing an instance of this does nothing; the query is not executed
98 until it is iterated over (or some other operation is performed that
99 involves iteration).
101 Instances should generally only be constructed by `Registry` methods or the
102 methods of other query result objects.
103 """
105 def __init__(
106 self,
107 db: Database,
108 query_factory: QueryFactoryMethod,
109 graph: DimensionGraph,
110 *,
111 order_by: Optional[Iterable[str]] = None,
112 limit: Optional[Tuple[int, Optional[int]]] = None,
113 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
114 ):
115 self._db = db
116 self._query_factory = query_factory
117 self._graph = graph
118 self._order_by = order_by
119 self._limit = limit
120 self._records = records
121 self._cached_query: Optional[Query] = None
123 __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")
125 @classmethod
126 def from_query(
127 cls,
128 db: Database,
129 query: Query,
130 graph: DimensionGraph,
131 *,
132 order_by: Optional[Iterable[str]] = None,
133 limit: Optional[Tuple[int, Optional[int]]] = None,
134 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
135 ) -> DataCoordinateQueryResults:
136 """Make an instance from a pre-existing query instead of a factory.
138 Parameters
139 ----------
140 db : `Database`
141 Database engine used to execute queries.
142 query : `Query`
143 Low-level representation of the query that backs this result
144 object.
145 graph : `DimensionGraph`
146 Dimensions used by query.
147 order_by : `Iterable` [ `str` ], optional
148 Optional sequence of column names used for result ordering.
149 limit : `Tuple` [ `int`, `int` ], optional
150 Limit for the number of returned records and optional offset.
151 records : `Mapping`, optional
152 A nested mapping containing `DimensionRecord` objects for all
153 dimensions and all data IDs this query will yield. If `None`
154 (default), `DataCoordinateIterable.hasRecords` will return `False`.
155 The outer mapping has `str` keys (the names of dimension elements).
156 The inner mapping has `tuple` keys representing data IDs (tuple
157 conversions of `DataCoordinate.values()`) and `DimensionRecord`
158 values.
159 """
161 def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
162 return query
164 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)
166 def __iter__(self) -> Iterator[DataCoordinate]:
167 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))
169 def __repr__(self) -> str:
170 return f"<DataCoordinate iterator with dimensions={self._graph}>"
172 def _clone(
173 self,
174 *,
175 query_factory: Optional[QueryFactoryMethod] = None,
176 query: Optional[Query] = None,
177 graph: Optional[DimensionGraph] = None,
178 order_by: Optional[Iterable[str]] = None,
179 limit: Optional[Tuple[int, Optional[int]]] = None,
180 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
181 ) -> DataCoordinateQueryResults:
182 """Clone this instance potentially updating some attributes."""
183 graph = graph if graph is not None else self._graph
184 order_by = order_by if order_by is not None else self._order_by
185 limit = limit if limit is not None else self._limit
186 records = records if records is not None else self._records
187 if query is None:
188 query_factory = query_factory or self._query_factory
189 return DataCoordinateQueryResults(
190 self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
191 )
192 else:
193 return DataCoordinateQueryResults.from_query(
194 self._db, query, graph, order_by=order_by, limit=limit, records=records
195 )
197 @property
198 def _query(self) -> Query:
199 """Query representation instance (`Query`)"""
200 if self._cached_query is None:
201 self._cached_query = self._query_factory(self._order_by, self._limit)
202 assert (
203 self._cached_query.datasetType is None
204 ), "Query used to initialize data coordinate results should not have any datasets."
205 return self._cached_query
207 @property
208 def graph(self) -> DimensionGraph:
209 # Docstring inherited from DataCoordinateIterable.
210 return self._graph
212 def hasFull(self) -> bool:
213 # Docstring inherited from DataCoordinateIterable.
214 return True
216 def hasRecords(self) -> bool:
217 # Docstring inherited from DataCoordinateIterable.
218 return self._records is not None or not self._graph
220 @contextmanager
221 def materialize(self) -> Iterator[DataCoordinateQueryResults]:
222 """Insert this query's results into a temporary table.
224 Returns
225 -------
226 context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
227 A context manager that ensures the temporary table is created and
228 populated in ``__enter__`` (returning a results object backed by
229 that table), and dropped in ``__exit__``. If ``self`` is already
230 materialized, the context manager may do nothing (reflecting the
231 fact that an outer context manager should already take care of
232 everything else).
234 Notes
235 -----
236 When using a very large result set to perform multiple queries (e.g.
237 multiple calls to `subset` with different arguments, or even a single
238 call to `expanded`), it may be much more efficient to start by
239 materializing the query and only then performing the follow up queries.
240 It may also be less efficient, depending on how well database engine's
241 query optimizer can simplify those particular follow-up queries and
242 how efficiently it caches query results even when the are not
243 explicitly inserted into a temporary table. See `expanded` and
244 `subset` for examples.
245 """
246 with self._query.materialize(self._db) as materialized:
247 # Note that we depend on order_by columns to be passes from Query
248 # to MaterializedQuery, so order_by and limit are not used.
249 yield self._clone(query=materialized)
251 def expanded(self) -> DataCoordinateQueryResults:
252 """Return a results object for which `hasRecords` returns `True`.
254 This method may involve actually executing database queries to fetch
255 `DimensionRecord` objects.
257 Returns
258 -------
259 results : `DataCoordinateQueryResults`
260 A results object for which `hasRecords` returns `True`. May be
261 ``self`` if that is already the case.
263 Notes
264 -----
265 For very result sets, it may be much more efficient to call
266 `materialize` before calling `expanded`, to avoid performing the
267 original query multiple times (as a subquery) in the follow-up queries
268 that fetch dimension records. For example::
270 with registry.queryDataIds(...).materialize() as tempDataIds:
271 dataIdsWithRecords = tempDataIds.expanded()
272 for dataId in dataIdsWithRecords:
273 ...
274 """
275 if self._records is None:
276 records = {}
277 for element in self.graph.elements:
278 subset = self.subset(graph=element.graph, unique=True)
279 records[element.name] = {
280 tuple(record.dataId.values()): record
281 for record in self._query.managers.dimensions[element].fetch(subset)
282 }
284 return self._clone(query=self._query, records=records)
285 else:
286 return self
288 def subset(
289 self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
290 ) -> DataCoordinateQueryResults:
291 """Return a results object containing a subset of the dimensions of
292 this one, and/or a unique near-subset of its rows.
294 This method may involve actually executing database queries to fetch
295 `DimensionRecord` objects.
297 Parameters
298 ----------
299 graph : `DimensionGraph`, optional
300 Dimensions to include in the new results object. If `None`,
301 ``self.graph`` is used.
302 unique : `bool`, optional
303 If `True` (`False` is default), the query should only return unique
304 data IDs. This is implemented in the database; to obtain unique
305 results via Python-side processing (which may be more efficient in
306 some cases), use `toSet` to construct a `DataCoordinateSet` from
307 this results object instead.
309 Returns
310 -------
311 results : `DataCoordinateQueryResults`
312 A results object corresponding to the given criteria. May be
313 ``self`` if it already qualifies.
315 Notes
316 -----
317 This method can only return a "near-subset" of the original result rows
318 in general because of subtleties in how spatial overlaps are
319 implemented; see `Query.subset` for more information.
321 When calling `subset` multiple times on the same very large result set,
322 it may be much more efficient to call `materialize` first. For
323 example::
325 dimensions1 = DimensionGraph(...)
326 dimensions2 = DimensionGraph(...)
327 with registry.queryDataIds(...).materialize() as tempDataIds:
328 for dataId1 in tempDataIds.subset(
329 graph=dimensions1,
330 unique=True):
331 ...
332 for dataId2 in tempDataIds.subset(
333 graph=dimensions2,
334 unique=True):
335 ...
336 """
337 if graph is None:
338 graph = self.graph
339 if not graph.issubset(self.graph):
340 raise ValueError(f"{graph} is not a subset of {self.graph}")
341 if graph == self.graph and (not unique or self._query.isUnique()):
342 return self
343 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
344 if self._records is not None:
345 records = {element.name: self._records[element.name] for element in graph.elements}
346 else:
347 records = None
348 query = self._query.subset(graph=graph, datasets=False, unique=unique)
350 return self._clone(graph=graph, query=query, records=records)
352 def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
353 # Docstring inherited from DataCoordinateIterable.
354 sql = self._query.sql
355 if sql is not None:
356 fromClause = sql.alias("c")
357 query.join(
358 fromClause,
359 onclause=sqlalchemy.sql.and_(
360 *[
361 columns(dimension.name) == fromClause.columns[dimension.name]
362 for dimension in self.graph.required
363 ]
364 ),
365 )
367 def findDatasets(
368 self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
369 ) -> ParentDatasetQueryResults:
370 """Find datasets using the data IDs identified by this query.
372 Parameters
373 ----------
374 datasetType : `DatasetType` or `str`
375 Dataset type or the name of one to search for. Must have
376 dimensions that are a subset of ``self.graph``.
377 collections : `Any`
378 An expression that fully or partially identifies the collections
379 to search for the dataset, such as a `str`, `re.Pattern`, or
380 iterable thereof. ``...`` can be used to return all collections.
381 See :ref:`daf_butler_collection_expressions` for more information.
382 findFirst : `bool`, optional
383 If `True` (default), for each result data ID, only yield one
384 `DatasetRef`, from the first collection in which a dataset of that
385 dataset type appears (according to the order of ``collections``
386 passed in). If `True`, ``collections`` must not contain regular
387 expressions and may not be ``...``.
389 Returns
390 -------
391 datasets : `ParentDatasetQueryResults`
392 A lazy-evaluation object representing dataset query results,
393 iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
394 nested data IDs in those dataset references will have records as
395 well.
397 Raises
398 ------
399 ValueError
400 Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
401 """
402 if not isinstance(datasetType, DatasetType):
403 datasetType = self._query.managers.datasets[datasetType].datasetType
404 # moving component handling down into managers.
405 if not datasetType.dimensions.issubset(self.graph):
406 raise ValueError(
407 f"findDatasets requires that the dataset type have the same dimensions as "
408 f"the DataCoordinateQueryResult used as input to the search, but "
409 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
410 f"dimensions are {self.graph}."
411 )
412 if datasetType.isComponent():
413 # We were given a true DatasetType instance, but it's a component.
414 parentName, componentName = datasetType.nameAndComponent()
415 storage = self._query.managers.datasets[parentName]
416 datasetType = storage.datasetType
417 components = [componentName]
418 else:
419 components = [None]
420 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
421 builder = self._query.makeBuilder(summary)
422 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
423 query = builder.finish(joinMissing=False)
424 return ParentDatasetQueryResults(
425 db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
426 )
428 def count(self, *, exact: bool = True) -> int:
429 """Count the number of rows this query would return.
431 Parameters
432 ----------
433 exact : `bool`, optional
434 If `True`, run the full query and perform post-query filtering if
435 needed to account for that filtering in the count. If `False`, the
436 result may be an upper bound.
438 Returns
439 -------
440 count : `int`
441 The number of rows the query would return, or an upper bound if
442 ``exact=False``.
444 Notes
445 -----
446 This counts the number of rows returned, not the number of unique rows
447 returned, so even with ``exact=True`` it may provide only an upper
448 bound on the number of *deduplicated* result rows.
449 """
450 return self._query.count(self._db, exact=exact)
452 def any(
453 self,
454 *,
455 execute: bool = True,
456 exact: bool = True,
457 ) -> bool:
458 """Test whether this query returns any results.
460 Parameters
461 ----------
462 execute : `bool`, optional
463 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
464 determined prior to execution that the query would return no rows.
465 exact : `bool`, optional
466 If `True`, run the full query and perform post-query filtering if
467 needed, until at least one result row is found. If `False`, the
468 returned result does not account for post-query filtering, and
469 hence may be `True` even when all result rows would be filtered
470 out.
472 Returns
473 -------
474 any : `bool`
475 `True` if the query would (or might, depending on arguments) yield
476 result rows. `False` if it definitely would not.
477 """
478 return self._query.any(self._db, execute=execute, exact=exact)
480 def explain_no_results(self) -> Iterator[str]:
481 """Return human-readable messages that may help explain why the query
482 yields no results.
484 Returns
485 -------
486 messages : `Iterator` [ `str` ]
487 String messages that describe reasons the query might not yield any
488 results.
490 Notes
491 -----
492 Messages related to post-query filtering are only available if the
493 iterator has been exhausted, or if `any` or `count` was already called
494 (with ``exact=True`` for the latter two).
496 This method first yields messages that are generated while the query is
497 being built or filtered, but may then proceed to diagnostics generated
498 by performing what should be inexpensive follow-up queries. Callers
499 can short-circuit this at any time by simplying not iterating further.
500 """
501 return self._query.explain_no_results(self._db)
503 def order_by(self, *args: str) -> DataCoordinateQueryResults:
504 """Make the iterator return ordered result.
506 Parameters
507 ----------
508 *args : `str`
509 Names of the columns/dimensions to use for ordering. Column name
510 can be prefixed with minus (``-``) to use descending ordering.
512 Returns
513 -------
514 result : `DataCoordinateQueryResults`
515 Returns ``self`` instance which is updated to return ordered
516 result.
518 Notes
519 -----
520 This method modifies the iterator in place and returns the same
521 instance to support method chaining.
522 """
523 return self._clone(order_by=args)
525 def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
526 """Make the iterator return limited number of records.
528 Parameters
529 ----------
530 limit : `int`
531 Upper limit on the number of returned records.
532 offset : `int` or `None`
533 If not `None` then the number of records to skip before returning
534 ``limit`` records.
536 Returns
537 -------
538 result : `DataCoordinateQueryResults`
539 Returns ``self`` instance which is updated to return limited set
540 of records.
542 Notes
543 -----
544 This method modifies the iterator in place and returns the same
545 instance to support method chaining. Normally this method is used
546 together with `order_by` method.
547 """
548 return self._clone(limit=(limit, offset))
551class DatasetQueryResults(Iterable[DatasetRef]):
552 """An interface for objects that represent the results of queries for
553 datasets.
554 """
556 @abstractmethod
557 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
558 """Group results by parent dataset type.
560 Returns
561 -------
562 iter : `Iterator` [ `ParentDatasetQueryResults` ]
563 An iterator over `DatasetQueryResults` instances that are each
564 responsible for a single parent dataset type (either just that
565 dataset type, one or more of its component dataset types, or both).
566 """
567 raise NotImplementedError()
569 @abstractmethod
570 def materialize(self) -> ContextManager[DatasetQueryResults]:
571 """Insert this query's results into a temporary table.
573 Returns
574 -------
575 context : `typing.ContextManager` [ `DatasetQueryResults` ]
576 A context manager that ensures the temporary table is created and
577 populated in ``__enter__`` (returning a results object backed by
578 that table), and dropped in ``__exit__``. If ``self`` is already
579 materialized, the context manager may do nothing (reflecting the
580 fact that an outer context manager should already take care of
581 everything else).
582 """
583 raise NotImplementedError()
585 @abstractmethod
586 def expanded(self) -> DatasetQueryResults:
587 """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
588 returns `True` for all data IDs in returned `DatasetRef` objects.
590 Returns
591 -------
592 expanded : `DatasetQueryResults`
593 Either a new `DatasetQueryResults` instance or ``self``, if it is
594 already expanded.
596 Notes
597 -----
598 As with `DataCoordinateQueryResults.expanded`, it may be more efficient
599 to call `materialize` before expanding data IDs for very large result
600 sets.
601 """
602 raise NotImplementedError()
604 @abstractmethod
605 def count(self, *, exact: bool = True) -> int:
606 """Count the number of rows this query would return.
608 Parameters
609 ----------
610 exact : `bool`, optional
611 If `True`, run the full query and perform post-query filtering if
612 needed to account for that filtering in the count. If `False`, the
613 result may be an upper bound.
615 Returns
616 -------
617 count : `int`
618 The number of rows the query would return, or an upper bound if
619 ``exact=False``.
621 Notes
622 -----
623 This counts the number of rows returned, not the number of unique rows
624 returned, so even with ``exact=True`` it may provide only an upper
625 bound on the number of *deduplicated* result rows.
626 """
627 raise NotImplementedError()
629 @abstractmethod
630 def any(
631 self,
632 *,
633 execute: bool = True,
634 exact: bool = True,
635 ) -> bool:
636 """Test whether this query returns any results.
638 Parameters
639 ----------
640 execute : `bool`, optional
641 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
642 determined prior to execution that the query would return no rows.
643 exact : `bool`, optional
644 If `True`, run the full query and perform post-query filtering if
645 needed, until at least one result row is found. If `False`, the
646 returned result does not account for post-query filtering, and
647 hence may be `True` even when all result rows would be filtered
648 out.
650 Returns
651 -------
652 any : `bool`
653 `True` if the query would (or might, depending on arguments) yield
654 result rows. `False` if it definitely would not.
655 """
656 raise NotImplementedError()
658 @abstractmethod
659 def explain_no_results(self) -> Iterator[str]:
660 """Return human-readable messages that may help explain why the query
661 yields no results.
663 Returns
664 -------
665 messages : `Iterator` [ `str` ]
666 String messages that describe reasons the query might not yield any
667 results.
669 Notes
670 -----
671 Messages related to post-query filtering are only available if the
672 iterator has been exhausted, or if `any` or `count` was already called
673 (with ``exact=True`` for the latter two).
675 This method first yields messages that are generated while the query is
676 being built or filtered, but may then proceed to diagnostics generated
677 by performing what should be inexpensive follow-up queries. Callers
678 can short-circuit this at any time by simplying not iterating further.
679 """
680 raise NotImplementedError()
683class ParentDatasetQueryResults(DatasetQueryResults):
684 """An object that represents results from a query for datasets with a
685 single parent `DatasetType`.
687 Parameters
688 ----------
689 db : `Database`
690 Database engine to execute queries against.
691 query : `Query`
692 Low-level query object that backs these results. ``query.datasetType``
693 will be the parent dataset type for this object, and may not be `None`.
694 components : `Sequence` [ `str` or `None` ]
695 Names of components to include in iteration. `None` may be included
696 (at most once) to include the parent dataset type.
697 records : `Mapping`, optional
698 Mapping containing `DimensionRecord` objects for all dimensions and
699 all data IDs this query will yield. If `None` (default),
700 `DataCoordinate.hasRecords` will return `False` for all nested data
701 IDs. This is a nested mapping with `str` names of dimension elements
702 as outer keys, `DimensionRecord` instances as inner values, and
703 ``tuple(record.dataId.values())`` for the inner keys / outer values
704 (where ``record`` is the innermost `DimensionRecord` instance).
705 datasetType : `DatasetType`, optional
706 Parent dataset type for all datasets returned by this query. If not
707 provided, ``query.datasetType`` be used, and must not be `None` (as it
708 is in the case where the query is known to yield no results prior to
709 execution).
710 """
712 def __init__(
713 self,
714 db: Database,
715 query: Query,
716 *,
717 components: Sequence[Optional[str]],
718 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
719 datasetType: Optional[DatasetType] = None,
720 ):
721 self._db = db
722 self._query = query
723 self._components = components
724 self._records = records
725 if datasetType is None:
726 datasetType = query.datasetType
727 assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
728 assert (
729 datasetType.dimensions == query.graph
730 ), f"Query dimensions {query.graph} do not match dataset type dimesions {datasetType.dimensions}."
731 self._datasetType = datasetType
733 __slots__ = ("_db", "_query", "_dimensions", "_components", "_records")
735 def __iter__(self) -> Iterator[DatasetRef]:
736 for row in self._query.rows(self._db):
737 parentRef = self._query.extractDatasetRef(row, records=self._records)
738 for component in self._components:
739 if component is None:
740 yield parentRef
741 else:
742 yield parentRef.makeComponentRef(component)
744 def __repr__(self) -> str:
745 return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"
747 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
748 # Docstring inherited from DatasetQueryResults.
749 yield self
751 @contextmanager
752 def materialize(self) -> Iterator[ParentDatasetQueryResults]:
753 # Docstring inherited from DatasetQueryResults.
754 with self._query.materialize(self._db) as materialized:
755 yield ParentDatasetQueryResults(
756 self._db, materialized, components=self._components, records=self._records
757 )
759 @property
760 def parentDatasetType(self) -> DatasetType:
761 """The parent dataset type for all datasets in this iterable
762 (`DatasetType`).
763 """
764 return self._datasetType
766 @property
767 def dataIds(self) -> DataCoordinateQueryResults:
768 """A lazy-evaluation object representing a query for just the data
769 IDs of the datasets that would be returned by this query
770 (`DataCoordinateQueryResults`).
772 The returned object is not in general `zip`-iterable with ``self``;
773 it may be in a different order or have (or not have) duplicates.
774 """
775 query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
776 return DataCoordinateQueryResults.from_query(
777 self._db,
778 query,
779 self.parentDatasetType.dimensions,
780 records=self._records,
781 )
783 def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
784 """Return a new query results object for the same parent datasets but
785 different components.
787 components : `Sequence` [ `str` or `None` ]
788 Names of components to include in iteration. `None` may be
789 included (at most once) to include the parent dataset type.
790 """
791 return ParentDatasetQueryResults(
792 self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
793 )
795 def expanded(self) -> ParentDatasetQueryResults:
796 # Docstring inherited from DatasetQueryResults.
797 if self._records is None:
798 records = self.dataIds.expanded()._records
799 return ParentDatasetQueryResults(
800 self._db,
801 self._query,
802 records=records,
803 components=self._components,
804 datasetType=self._datasetType,
805 )
806 else:
807 return self
809 def count(self, *, exact: bool = True) -> int:
810 # Docstring inherited.
811 return len(self._components) * self._query.count(self._db, exact=exact)
813 def any(
814 self,
815 *,
816 execute: bool = True,
817 exact: bool = True,
818 ) -> bool:
819 # Docstring inherited.
820 return self._query.any(self._db, execute=execute, exact=exact)
822 def explain_no_results(self) -> Iterator[str]:
823 # Docstring inherited.
824 return self._query.explain_no_results(self._db)
827class ChainedDatasetQueryResults(DatasetQueryResults):
828 """A `DatasetQueryResults` implementation that simply chains together
829 other results objects, each for a different parent dataset type.
831 Parameters
832 ----------
833 chain : `Sequence` [ `ParentDatasetQueryResults` ]
834 The underlying results objects this object will chain together.
835 doomed_by : `Iterable` [ `str` ], optional
836 A list of messages (appropriate for e.g. logging or exceptions) that
837 explain why the query is known to return no results even before it is
838 executed. Queries with a non-empty list will never be executed.
839 Child results objects may also have their own list.
840 """
842 def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
843 self._chain = chain
844 self._doomed_by = tuple(doomed_by)
846 __slots__ = ("_chain",)
848 def __iter__(self) -> Iterator[DatasetRef]:
849 return itertools.chain.from_iterable(self._chain)
851 def __repr__(self) -> str:
852 return "<DatasetRef iterator for multiple dataset types>"
854 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
855 # Docstring inherited from DatasetQueryResults.
856 return iter(self._chain)
858 @contextmanager
859 def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
860 # Docstring inherited from DatasetQueryResults.
861 with ExitStack() as stack:
862 yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])
864 def expanded(self) -> ChainedDatasetQueryResults:
865 # Docstring inherited from DatasetQueryResults.
866 return ChainedDatasetQueryResults([r.expanded() for r in self._chain])
868 def count(self, *, exact: bool = True) -> int:
869 # Docstring inherited.
870 return sum(r.count(exact=exact) for r in self._chain)
872 def any(
873 self,
874 *,
875 execute: bool = True,
876 exact: bool = True,
877 ) -> bool:
878 # Docstring inherited.
879 return any(r.any(execute=execute, exact=exact) for r in self._chain)
881 def explain_no_results(self) -> Iterator[str]:
882 # Docstring inherited.
883 for r in self._chain:
884 yield from r.explain_no_results()
885 yield from self._doomed_by
888class DimensionRecordQueryResults(Iterable[DimensionRecord]):
889 """An interface for objects that represent the results of queries for
890 dimension records.
891 """
893 @abstractmethod
894 def count(self, *, exact: bool = True) -> int:
895 """Count the number of rows this query would return.
897 Parameters
898 ----------
899 exact : `bool`, optional
900 If `True`, run the full query and perform post-query filtering if
901 needed to account for that filtering in the count. If `False`, the
902 result may be an upper bound.
904 Returns
905 -------
906 count : `int`
907 The number of rows the query would return, or an upper bound if
908 ``exact=False``.
910 Notes
911 -----
912 This counts the number of rows returned, not the number of unique rows
913 returned, so even with ``exact=True`` it may provide only an upper
914 bound on the number of *deduplicated* result rows.
915 """
916 raise NotImplementedError()
918 @abstractmethod
919 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
920 """Test whether this query returns any results.
922 Parameters
923 ----------
924 execute : `bool`, optional
925 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
926 determined prior to execution that the query would return no rows.
927 exact : `bool`, optional
928 If `True`, run the full query and perform post-query filtering if
929 needed, until at least one result row is found. If `False`, the
930 returned result does not account for post-query filtering, and
931 hence may be `True` even when all result rows would be filtered
932 out.
934 Returns
935 -------
936 any : `bool`
937 `True` if the query would (or might, depending on arguments) yield
938 result rows. `False` if it definitely would not.
939 """
940 raise NotImplementedError()
942 @abstractmethod
943 def order_by(self, *args: str) -> DimensionRecordQueryResults:
944 """Make the iterator return ordered result.
946 Parameters
947 ----------
948 *args : `str`
949 Names of the columns/dimensions to use for ordering. Column name
950 can be prefixed with minus (``-``) to use descending ordering.
952 Returns
953 -------
954 result : `DimensionRecordQueryResults`
955 Returns ``self`` instance which is updated to return ordered
956 result.
958 Notes
959 -----
960 This method can modify the iterator in place and return the same
961 instance.
962 """
963 raise NotImplementedError()
965 @abstractmethod
966 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
967 """Make the iterator return limited number of records.
969 Parameters
970 ----------
971 limit : `int`
972 Upper limit on the number of returned records.
973 offset : `int` or `None`
974 If not `None` then the number of records to skip before returning
975 ``limit`` records.
977 Returns
978 -------
979 result : `DimensionRecordQueryResults`
980 Returns ``self`` instance which is updated to return limited set
981 of records.
983 Notes
984 -----
985 This method can modify the iterator in place and return the same
986 instance. Normally this method is used together with `order_by`
987 method.
988 """
989 raise NotImplementedError()
992class _DimensionRecordKey:
993 """Class for objects used as keys in ordering `DimensionRecord` instances.
995 Parameters
996 ----------
997 attributes : `Sequence` [ `str` ]
998 Sequence of attribute names to use for comparison.
999 ordering : `Sequence` [ `bool` ]
1000 Matching sequence of ordering flags, `False` for descending ordering,
1001 `True` for ascending ordering.
1002 record : `DimensionRecord`
1003 `DimensionRecord` to compare to other records.
1004 """
1006 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
1007 self.attributes = attributes
1008 self.ordering = ordering
1009 self.rec = record
1011 def _cmp(self, other: _DimensionRecordKey) -> int:
1012 """Compare two records using provided comparison operator.
1014 Parameters
1015 ----------
1016 other : `_DimensionRecordKey`
1017 Key for other record.
1019 Returns
1020 -------
1021 result : `int`
1022 0 if keys are identical, negative if ``self`` is ordered before
1023 ``other``, positive otherwise.
1024 """
1025 for attribute, ordering in zip(self.attributes, self.ordering):
1026 # timespan.begin/end cannot use getattr
1027 attrgetter = operator.attrgetter(attribute)
1028 lhs = attrgetter(self.rec)
1029 rhs = attrgetter(other.rec)
1030 if not ordering:
1031 lhs, rhs = rhs, lhs
1032 if lhs != rhs:
1033 return 1 if lhs > rhs else -1
1034 return 0
1036 def __lt__(self, other: _DimensionRecordKey) -> bool:
1037 return self._cmp(other) < 0
1039 def __gt__(self, other: _DimensionRecordKey) -> bool:
1040 return self._cmp(other) > 0
1042 def __eq__(self, other: Any) -> bool:
1043 if not isinstance(other, _DimensionRecordKey):
1044 return NotImplemented
1045 return self._cmp(other) == 0
1047 def __le__(self, other: _DimensionRecordKey) -> bool:
1048 return self._cmp(other) <= 0
1050 def __ge__(self, other: _DimensionRecordKey) -> bool:
1051 return self._cmp(other) >= 0
1054class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
1055 """Implementation of DimensionRecordQueryResults using database query.
1057 Parameters
1058 ----------
1059 dataIds : `DataCoordinateQueryResults`
1060 Iterator for DataIds.
1061 recordStorage : `DimensionRecordStorage`
1062 Instance of storage class for dimension records.
1063 """
1065 def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
1066 self._dataIds = dataIds
1067 self._recordStorage = recordStorage
1068 self._order_by: Iterable[str] = ()
1070 def __iter__(self) -> Iterator[DimensionRecord]:
1071 # LIMIT is already applied at DataCoordinateQueryResults level
1072 # (assumption here is that if DataId exists then dimension record
1073 # exists too and their counts must be equal). fetch() does not
1074 # guarantee ordering, so we need to sort records in memory below.
1075 recordIter = self._recordStorage.fetch(self._dataIds)
1076 if not self._order_by:
1077 return iter(recordIter)
1079 # Parse list of column names and build a list of attribute name for
1080 # ordering. Note that here we only support ordering by direct
1081 # attributes of the element, and not other elements from the dimension
1082 # graph.
1083 orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
1084 attributes: List[str] = []
1085 ordering: List[bool] = []
1086 for column in orderBy.order_by_columns:
1087 if column.column is None:
1088 assert isinstance(column.element, Dimension), "Element must be a Dimension"
1089 attributes.append(column.element.primaryKey.name)
1090 else:
1091 attributes.append(column.column)
1092 ordering.append(column.ordering)
1094 def _key(record: DimensionRecord) -> _DimensionRecordKey:
1095 return _DimensionRecordKey(attributes, ordering, record)
1097 records = sorted(recordIter, key=_key)
1098 return iter(records)
1100 def count(self, *, exact: bool = True) -> int:
1101 # Docstring inherited from base class.
1102 return self._dataIds.count(exact=exact)
1104 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
1105 # Docstring inherited from base class.
1106 return self._dataIds.any(execute=execute, exact=exact)
1108 def order_by(self, *args: str) -> DimensionRecordQueryResults:
1109 # Docstring inherited from base class.
1110 self._dataIds = self._dataIds.order_by(*args)
1111 self._order_by = args
1112 return self
1114 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
1115 # Docstring inherited from base class.
1116 self._dataIds = self._dataIds.limit(limit, offset)
1117 return self