Coverage for python/lsst/daf/butler/registry/queries/_results.py: 32%
161 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "ParentDatasetQueryResults",
)

from abc import abstractmethod
from contextlib import contextmanager, ExitStack
import itertools
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database
from ._query import Query


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
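
    For example (an illustrative sketch; ``registry`` is assumed to be a
    `Registry`, and the dimension and skymap names are placeholders)::

        dataIds = registry.queryDataIds(["tract", "patch"], skymap="sky1")
        for dataId in dataIds:
            ...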
86 """
87 def __init__(self, db: Database, query: Query, *,
88 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
89 self._db = db
90 self._query = query
91 self._records = records
92 assert query.datasetType is None, \
93 "Query used to initialize data coordinate results should not have any datasets."
95 __slots__ = ("_db", "_query", "_records")

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
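
        For example (a sketch; ``registry`` is assumed to be a `Registry`)::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                # Both follow-up queries run against the temporary table
                # rather than re-executing the original query.
                uniqueDataIds = tempDataIds.subset(unique=True)
                withRecords = tempDataIds.expanded()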
138 """
139 with self._query.materialize(self._db) as materialized:
140 yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
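
        Notes
        -----
        For example (a sketch; the dataset type, instrument, and collection
        names are placeholders)::

            dataIds = registry.queryDataIds(["visit", "detector"], instrument="HSC")
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/test"])
            for ref in refs:
                ...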
289 """
290 if not isinstance(datasetType, DatasetType):
291 datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: move component handling down into the managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(f"findDatasets requires that the dataset type's dimensions be a subset of "
                             f"those of the DataCoordinateQueryResults used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        builder = self._query.makeBuilder()
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records, datasetType=datasetType)

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
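
        For example, an inexpensive guard against accidentally huge queries
        (a sketch; ``results`` is any results object with this method)::

            if results.count(exact=False) > 1_000_000:
                raise RuntimeError("Query would return too many rows.")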
333 """
334 return self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
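
        For example, skipping work when a query is provably empty (a sketch;
        ``results`` is any results object with this method)::

            if not results.any(execute=False, exact=False):
                # Provably empty without running the query at all.
                return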
360 """
361 return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        At present, this method only returns messages that are generated
        while the query is being built or filtered.  In the future, it may
        perform its own new follow-up queries, which users may wish to
        short-circuit simply by not continuing to iterate over its results.
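
        For example (a sketch; ``results`` and ``log`` are placeholders)::

            if not results.any():
                for message in results.explain_no_results():
                    log.warning(message)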
383 """
384 return self._query.explain_no_results(self._db)


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
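
        For example (a sketch; ``refs`` is any `DatasetQueryResults`)::

            for parentResults in refs.byParentDatasetType():
                print(parentResults.parentDatasetType.name)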
402 """
403 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        At present, this method only returns messages that are generated
        while the query is being built or filtered.  In the future, it may
        perform its own new follow-up queries, which users may wish to
        short-circuit simply by not continuing to iterate over its results.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping: the outer keys are the `str` names of
        dimension elements, the inner keys are data ID tuples
        (``tuple(record.dataId.values())``), and the inner values are the
        corresponding `DimensionRecord` instances.
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, which must then not be
        `None` (as it is when the query is known prior to execution to yield
        no results).
545 """
546 def __init__(self, db: Database, query: Query, *,
547 components: Sequence[Optional[str]],
548 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
549 datasetType: Optional[DatasetType] = None):
550 self._db = db
551 self._query = query
552 self._components = components
553 self._records = records
554 if datasetType is None:
555 datasetType = query.datasetType
556 assert datasetType is not None, \
557 "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions == query.graph, \
            f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
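
        For example, expanding just the data IDs of a dataset query (a
        sketch; ``refs`` is a `ParentDatasetQueryResults`)::

            for dataId in refs.dataIds.expanded():
                ...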
600 """
601 return DataCoordinateQueryResults(
602 self._db,
603 self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
604 records=self._records,
605 )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
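
        For example (a sketch; the component names are placeholders)::

            for ref in refs.withComponents(["image", "mask"]):
                ...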
614 """
615 return ParentDatasetQueryResults(self._db, self._query, records=self._records,
616 components=components, datasetType=self._datasetType)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components, datasetType=self._datasetType)
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
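
    For example (a sketch; ``perParentResults`` is assumed to be a sequence
    of `ParentDatasetQueryResults` objects built elsewhere)::

        chained = ChainedDatasetQueryResults(perParentResults)
        for ref in chained:
            ...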
657 """
659 def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
660 self._chain = chain
661 self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            # Propagate the doomed-by messages so they are not lost by
            # materialization.
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                doomed_by=self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain], doomed_by=self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by