Coverage for python/lsst/daf/butler/registry/queries/_results.py: 32%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "ParentDatasetQueryResults",
)

from abc import abstractmethod
from contextlib import contextmanager, ExitStack
import itertools
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database
from ._query import Query
from ._structs import QuerySummary


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does not itself run the query;
    the query is not executed until the instance is iterated over (or some
    other operation is performed that involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
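
    For example (a hypothetical usage sketch; ``registry`` is assumed to be
    a `Registry` instance)::

        results = registry.queryDataIds(["visit", "detector"])  # no query yet
        for dataId in results:
            ...  # the query executes only when iteration begins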
87 """
88 def __init__(self, db: Database, query: Query, *,
89 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
90 self._db = db
91 self._query = query
92 self._records = records
93 assert query.datasetType is None, \
94 "Query used to initialize data coordinate results should not have any datasets."
96 __slots__ = ("_db", "_query", "_records")
98 def __iter__(self) -> Iterator[DataCoordinate]:
99 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))
101 @property
102 def graph(self) -> DimensionGraph:
103 # Docstring inherited from DataCoordinateIterable.
104 return self._query.graph
106 def hasFull(self) -> bool:
107 # Docstring inherited from DataCoordinateIterable.
108 return True
110 def hasRecords(self) -> bool:
111 # Docstring inherited from DataCoordinateIterable.
112 return self._records is not None or not self._query.graph
114 @contextmanager
115 def materialize(self) -> Iterator[DataCoordinateQueryResults]:
116 """Insert this query's results into a temporary table.
118 Returns
119 -------
120 context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
121 A context manager that ensures the temporary table is created and
122 populated in ``__enter__`` (returning a results object backed by
123 that table), and dropped in ``__exit__``. If ``self`` is already
124 materialized, the context manager may do nothing (reflecting the
125 fact that an outer context manager should already take care of
126 everything else).
128 Notes
129 -----
130 When using a very large result set to perform multiple queries (e.g.
131 multiple calls to `subset` with different arguments, or even a single
132 call to `expanded`), it may be much more efficient to start by
133 materializing the query and only then performing the follow up queries.
134 It may also be less efficient, depending on how well database engine's
135 query optimizer can simplify those particular follow-up queries and
136 how efficiently it caches query results even when the are not
137 explicitly inserted into a temporary table. See `expanded` and
138 `subset` for examples.
139 """
140 with self._query.materialize(self._db) as materialized:
141 yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
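
        Notes
        -----
        For example (an illustrative sketch; the dataset type and collection
        names here are hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/RC2"])
            for ref in refs:
                ...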
290 """
291 if not isinstance(datasetType, DatasetType):
292 datasetType = self._query.managers.datasets[datasetType].datasetType
293 # moving component handling down into managers.
294 if not datasetType.dimensions.issubset(self.graph):
295 raise ValueError(f"findDatasets requires that the dataset type have the same dimensions as "
296 f"the DataCoordinateQueryResult used as input to the search, but "
297 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
298 f"dimensions are {self.graph}.")
299 if datasetType.isComponent():
300 # We were given a true DatasetType instance, but it's a component.
301 parentName, componentName = datasetType.nameAndComponent()
302 storage = self._query.managers.datasets[parentName]
303 datasetType = storage.datasetType
304 components = [componentName]
305 else:
306 components = [None]
307 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
308 builder = self._query.makeBuilder(summary)
309 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
310 query = builder.finish(joinMissing=False)
311 return ParentDatasetQueryResults(db=self._db, query=query, components=components,
312 records=self._records, datasetType=datasetType)

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
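
        For example, a cheap check can be made before running the full query
        (an illustrative sketch; an inexact count of zero means the query
        cannot return any rows)::

            if results.count(exact=False) == 0:
                print("Query cannot return any rows.")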
335 """
336 return self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
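
        For example, a quick existence check that skips post-query filtering
        (an illustrative sketch; a `False` result here is definitive)::

            if not results.any(exact=False):
                print("No rows, even before post-query filtering.")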
362 """
363 return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        At present, this method only returns messages that are generated
        while the query is being built or filtered. In the future, it may
        perform its own new follow-up queries, which users may wish to
        short-circuit simply by not continuing to iterate over its results.
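
        For example (an illustrative sketch)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)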
385 """
386 return self._query.explain_no_results(self._db)


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
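
        For example (an illustrative sketch)::

            for parentResults in results.byParentDatasetType():
                print(parentResults.parentDatasetType.name)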
404 """
405 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        At present, this method only returns messages that are generated
        while the query is being built or filtered. In the future, it may
        perform its own new follow-up queries, which users may wish to
        short-circuit simply by not continuing to iterate over its results.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` as inner keys (where ``record`` is
        the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                 datasetType: Optional[DatasetType] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, \
                "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions == query.graph, \
            f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
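
        For example, to iterate over only the data IDs (an illustrative
        sketch, where ``refs`` is a `ParentDatasetQueryResults` instance)::

            for dataId in refs.dataIds:
                ...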
602 """
603 return DataCoordinateQueryResults(
604 self._db,
605 self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
606 records=self._records,
607 )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets
        but different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
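
        For example (an illustrative sketch; the component names depend on
        the parent dataset type's storage class and are hypothetical here)::

            both = refs.withComponents([None, "image"])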
616 """
617 return ParentDatasetQueryResults(self._db, self._query, records=self._records,
618 components=components, datasetType=self._datasetType)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components, datasetType=self._datasetType)
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                # Propagate doom messages so they are not lost by
                # materialization.
                doomed_by=self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain],
                                          doomed_by=self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by