# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "ParentDatasetQueryResults",
)

from abc import abstractmethod
from contextlib import contextmanager, ExitStack
import itertools
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database
from ._query import Query


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does not execute the query; the
    query is executed only when the instance is iterated over (or when some
    other operation that involves iteration is performed).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
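
    For example (a sketch; ``registry`` stands for an existing `Registry`
    instance, and the dimension names are hypothetical)::

        results = registry.queryDataIds(["visit", "detector"])
        for dataId in results:  # the query executes here, lazily
            ...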
86 """
    def __init__(self, db: Database, query: Query, *,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._records = records
        assert query.datasetType is None, \
            "Query used to initialize data coordinate results should not have any datasets."

    __slots__ = ("_db", "_query", "_records")

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records)
                for row in self._query.rows(self._db))

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
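
        Notes
        -----
        For example, assuming ``"calexp"`` is a registered dataset type and
        ``"HSC/runs/example"`` is an existing collection (both hypothetical
        names)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp",
                                        collections=["HSC/runs/example"])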
289 """
290 if not isinstance(datasetType, DatasetType):
291 datasetType = self._query.managers.datasets[datasetType].datasetType
292 # moving component handling down into managers.
293 if not datasetType.dimensions.issubset(self.graph):
294 raise ValueError(f"findDatasets requires that the dataset type have the same dimensions as "
295 f"the DataCoordinateQueryResult used as input to the search, but "
296 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
297 f"dimensions are {self.graph}.")
298 builder = self._query.makeBuilder()
299 if datasetType.isComponent():
300 # We were given a true DatasetType instance, but it's a component.
301 parentName, componentName = datasetType.nameAndComponent()
302 storage = self._query.managers.datasets[parentName]
303 datasetType = storage.datasetType
304 components = [componentName]
305 else:
306 components = [None]
307 if not builder.joinDataset(datasetType, collections=collections, findFirst=findFirst):
308 raise RuntimeError(
309 f"Error finding datasets of type {datasetType.name} in collections {collections}; "
310 "it is impossible for any such datasets to be found in any of those collections, "
311 "most likely because the dataset type is not registered. "
312 "This error may become a successful query that returns no results in the future, "
313 "because queries with no results are not usually considered an error."
314 )
315 query = builder.finish(joinMissing=False)
316 return ParentDatasetQueryResults(db=self._db, query=query, components=components,
317 records=self._records)


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
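
        Notes
        -----
        For example (a sketch; ``results`` stands for any
        `DatasetQueryResults` instance)::

            for parentResults in results.byParentDatasetType():
                print(parentResults.parentDatasetType.name)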
335 """
336 raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
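        For example (a sketch; ``results`` stands for any
        `DatasetQueryResults` instance)::

            with results.materialize() as tempResults:
                expandedResults = tempResults.expanded()
                for ref in expandedResults:
                    ...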
370 """
371 raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results. ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with the `str` names of dimension
        elements as outer keys; each inner mapping has `tuple` keys given by
        ``tuple(record.dataId.values())`` and the corresponding
        `DimensionRecord` instances as values.
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        assert query.datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert query.datasetType.dimensions == query.graph

    __slots__ = ("_db", "_query", "_dimensions", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        assert self._query.datasetType is not None
        return self._query.datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
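
        For example (a sketch; ``parentResults`` stands for any instance of
        this class)::

            # Query for just the data IDs, with dimension records attached.
            dataIdsWithRecords = parentResults.dataIds.expanded()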
447 """
448 return DataCoordinateQueryResults(
449 self._db,
450 self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
451 records=self._records,
452 )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
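
        Notes
        -----
        For example, assuming the parent dataset type has a ``wcs`` component
        (a hypothetical name; ``results`` stands for any instance of this
        class), this yields only that component::

            wcsResults = results.withComponents(["wcs"])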
461 """
462 return ParentDatasetQueryResults(self._db, self._query, records=self._records,
463 components=components)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components)
        else:
            return self


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
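
    Notes
    -----
    For example, assuming ``a`` and ``b`` are `ParentDatasetQueryResults`
    instances for two different parent dataset types::

        chained = ChainedDatasetQueryResults([a, b])
        for ref in chained:  # yields all of a's refs, then all of b's
            ...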
483 """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults]):
        self._chain = chain

    __slots__ = ("_chain",)

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain]
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])