Coverage for python/lsst/daf/butler/registry/queries/_results.py : 27%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "ChainedDatasetQueryResults",
25 "DataCoordinateQueryResults",
26 "DatasetQueryResults",
27 "ParentDatasetQueryResults",
28)
30from abc import abstractmethod
31from contextlib import contextmanager, ExitStack
32import itertools
33from typing import (
34 Any,
35 Callable,
36 ContextManager,
37 Iterable,
38 Iterator,
39 Mapping,
40 Optional,
41 Sequence,
42 Union,
43)
45import sqlalchemy
47from ...core import (
48 DataCoordinate,
49 DataCoordinateIterable,
50 DatasetRef,
51 DatasetType,
52 DimensionGraph,
53 DimensionRecord,
54 SimpleQuery,
55)
56from ..interfaces import Database
57from ._query import Query
class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """
    def __init__(self, db: Database, query: Query, *,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._records = records
        # Internal invariant: a query that also joins in a dataset belongs in
        # ParentDatasetQueryResults instead.
        assert query.datasetType is None, \
            "Query used to initialize data coordinate results should not have any datasets."

    __slots__ = ("_db", "_query", "_records")

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Lazy: rows are fetched and converted only as the generator is
        # consumed.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # A falsy graph (presumably one with no dimensions) is reported as
        # having records even when none were supplied.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Any records already attached stay valid for the materialized
            # query, so they are passed through unchanged.
            yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                # Fetch each element's records once per unique data ID of
                # that element, not once per result row.
                uniqueDataIds = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(uniqueDataIds)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised if ``graph`` is not a subset of ``self.graph``.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        # Nothing to do if the dimensions are unchanged and uniqueness is
        # either not requested or already guaranteed by the query.
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Keep only the records for elements that survive the subset.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        # When the backing query has no SQL representation, no join is added.
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     deduplicate: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        deduplicate : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # NOTE(review): component dataset types are unpacked by hand further
        # down; this could be simplified by moving component handling down
        # into managers.
        if not datasetType.dimensions.issubset(self.graph):
            # The requirement is "subset of", not "equal to", so say so.
            raise ValueError(f"findDatasets requires that the dataset type have only dimensions in "
                             f"the DataCoordinateQueryResult used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        builder = self._query.makeBuilder()
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # Query for the parent and remember which component to yield.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            # `None` stands for "the parent dataset itself".
            components = [None]
        builder.joinDataset(datasetType, collections=collections, deduplicate=deduplicate)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records)
class DatasetQueryResults(Iterable[DatasetRef]):
    """Abstract interface shared by all objects that represent the results
    of a query for datasets.

    Iterating over an instance yields `DatasetRef` objects.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Split these results up by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances, each of which
            covers exactly one parent dataset type (the parent dataset type
            itself, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Write the results of this query into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager whose ``__enter__`` creates and populates the
            temporary table and returns a results object backed by it, and
            whose ``__exit__`` drops the table again.  When ``self`` is
            already materialized the context manager may do nothing, since
            an outer context manager is then expected to handle the table's
            lifetime.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for every data ID held by
        the returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            A new `DatasetQueryResults` instance, or ``self`` when it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, calling `materialize`
        first may be more efficient when expanding the data IDs of a very
        large result set.
        """
        raise NotImplementedError()
class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping: the outer keys are the `str` names of
        dimension elements, and each outer value is an inner mapping from
        ``tuple(record.dataId.values())`` to the corresponding
        `DimensionRecord` instance ``record``.
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        # Internal invariants on the backing query.
        assert query.datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert query.datasetType.dimensions == query.graph, \
            "Query used to initialize dataset results must have the dimensions of its dataset type."

    # Only attributes actually assigned in ``__init__`` are listed here.
    __slots__ = ("_db", "_query", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            # One result row can yield several refs: the parent itself
            # (``None``) and/or any requested components.
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        assert self._query.datasetType is not None
        return self._query.datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object that shares this one's database, query and
            records but iterates over ``components``.
        """
        return ParentDatasetQueryResults(self._db, self._query, records=self._records,
                                         components=components)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            # Reuse the data-ID expansion logic and borrow the records it
            # builds.
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components)
        else:
            return self
class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation formed by concatenating other
    results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The per-parent-dataset-type results objects to chain together; they
        are iterated in the order given.
    """

    __slots__ = ("_chain",)

    def __init__(self, chain: Sequence[ParentDatasetQueryResults]):
        self._chain = chain

    def __iter__(self) -> Iterator[DatasetRef]:
        # Flatten lazily: each link is only iterated once it is reached.
        links = self._chain
        return itertools.chain.from_iterable(links)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        links = self._chain
        return iter(links)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as cleanup:
            # Enter every link's own materialize() context; the stack exits
            # them all when this context manager is left.
            materializedLinks = []
            for link in self._chain:
                materializedLinks.append(cleanup.enter_context(link.materialize()))
            yield ChainedDatasetQueryResults(materializedLinks)

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        expandedLinks = [link.expanded() for link in self._chain]
        return ChainedDatasetQueryResults(expandedLinks)