Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DataCoordinateQueryResults",
26 "DatasetQueryResults",
27 "ParentDatasetQueryResults",
28)
30from abc import abstractmethod
31from contextlib import contextmanager, ExitStack
32import itertools
33from typing import (
34 Any,
35 Callable,
36 ContextManager,
37 Iterable,
38 Iterator,
39 Mapping,
40 Optional,
41 Sequence,
42 Union,
43)
45import sqlalchemy
47from ...core import (
48 DataCoordinate,
49 DataCoordinateIterable,
50 DatasetRef,
51 DatasetType,
52 DimensionGraph,
53 DimensionRecord,
54 SimpleQuery,
55)
56from ..interfaces import Database
57from ._query import Query
58from ._structs import QuerySummary
class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """
    def __init__(self, db: Database, query: Query, *,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._records = records
        # Dataset-joined queries are represented by the *DatasetQueryResults
        # classes below; a data coordinate result must not carry one.
        assert query.datasetType is None, \
            "Query used to initialize data coordinate results should not have any datasets."

    __slots__ = ("_db", "_query", "_records")

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Lazily convert each fetched row into a DataCoordinate, attaching
        # dimension records if they were provided at construction.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._query.graph}>"

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # An empty dimension graph has no records to attach, so results are
        # trivially "expanded" in that case.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well database engine's
        query optimizer can simplify those particular follow-up queries and
        how efficiently it caches query results even when they are not
        explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                # Fetch records for each dimension element via a unique subset
                # query over just that element's dimensions.
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            # The requested subset is exactly what we already have.
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Keep only the records for dimension elements that survive the
            # subset operation.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            # Join the given query against this query's SQL (aliased as "c"),
            # equating each required dimension column.
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # NOTE(review): the comment below appears truncated in the original;
        # presumably a TODO about removing the component handling here once it
        # is moved down into the dataset managers — confirm against VCS
        # history.
        # moving component handling down into managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(f"findDatasets requires that the dataset type have the same dimensions as "
                             f"the DataCoordinateQueryResult used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # Query against the parent dataset type and remember which
            # component to yield.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records, datasetType=datasetType)

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        return self._query.explain_no_results(self._db)
class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.

    Notes
    -----
    `typing.Iterable` is an ABC, so subclasses must implement every method
    marked ``@abstractmethod`` here (plus ``__iter__``) before they can be
    instantiated.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        raise NotImplementedError()
class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                 datasetType: Optional[DatasetType] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
        assert datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions == query.graph, \
            f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    # Fixed: the original listed a never-assigned "_dimensions" and omitted
    # "_datasetType", which __init__ does assign.
    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        # Each query row yields the parent DatasetRef once per requested
        # component (None meaning the parent itself).
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        # This object already holds a single parent dataset type.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            # Pass the parent dataset type through explicitly so the new
            # results object does not depend on the materialized query still
            # carrying it (the original dropped it here).
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records,
                                            datasetType=self._datasetType)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object backed by the same query, yielding the given
            components.
        """
        return ParentDatasetQueryResults(self._db, self._query, records=self._records,
                                         components=components, datasetType=self._datasetType)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            # Delegate record-fetching to the data ID results object, then
            # reuse its records mapping for our own datasets.
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components, datasetType=self._datasetType)
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        # Each query row is yielded once per requested component.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)
class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    # Fixed: "_doomed_by" was missing even though __init__ assigns it (it
    # only worked because the base class has no __slots__, so instances
    # silently fell back to __dict__).
    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            # Fixed: propagate the diagnostic messages; they were previously
            # dropped by materialization, hiding them from
            # explain_no_results on the materialized object.
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                doomed_by=self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        # Fixed: preserve the diagnostic messages across expansion as well.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain],
                                          doomed_by=self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        # Child diagnostics first, then this object's own doomed messages.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by