# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query",)

from collections.abc import Iterable, Mapping, Set
from types import EllipsisType
from typing import Any, final, overload

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError, MissingDatasetTypeError
from ._base import HomogeneousQueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import (
    ChainedDatasetQueryResults,
    DatasetQueryResults,
    SingleTypeDatasetQueryResults,
)
from ._dimension_record_query_results import DimensionRecordQueryResults
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree


@final
class Query(HomogeneousQueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `QueryTree`
        Description of the query as a tree of joins and column expressions. The
        instance returned directly by the `Butler._query` entry point should be
        constructed via `make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler._query` instead.

    A `Query` object represents the first stage of query construction, in which
    constraints and joins are defined (roughly corresponding to the WHERE and
    FROM clauses in SQL). The various "results" objects represent the second
    (and final) stage, where the columns returned are specified and any sorting
    or integer slicing can be applied. Result objects are obtained from the
    `data_ids`, `datasets`, and `dimension_records` methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
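
    As a minimal illustrative sketch (the instrument name, collection name,
    and ``raw`` dataset type below are assumptions about the repository's
    contents, not part of this API)::

        with butler._query() as query:
            query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
            query = query.where(instrument="LSSTCam", detector=10)
            refs = list(query.datasets("raw"))
            data_ids = list(query.data_ids(["exposure", "detector"]))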
    """

    def __init__(self, driver: QueryDriver, tree: QueryTree):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        super().__init__(driver, tree)

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of any
        type of result. Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class override
        # specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery (such
        as a dataset search, materialization, or data ID upload) or `where`
        argument, as well as any required or implied dependency of those
        dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == 'u', x.band == 'y'),
                )

        As shown above, the returned object also has an `any` method to
        combine expressions with logical OR (as well as `not_` and `all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with dataset types (``dataset_id``,
        ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
        `~CollectionType.CALIBRATION` collection searches) can be obtained with
        dict-like access instead::

            with butler._query() as query:
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload the
        standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
        ``>=``) and provide `~ScalarExpressionProxy.in_range`,
        `~ScalarExpressionProxy.in_iterable`, and
        `~ScalarExpressionProxy.in_query` methods for membership tests. For
        `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should be
        reversed.

        Proxy objects for region and timespan fields have an `overlaps` method,
        and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
        properties to access scalar expression proxies for the bounds.

        All proxy objects also have a `~ExpressionProxy.is_null` property.

        Literal values can be created by calling `ExpressionFactory.literal`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `DataCoordinate` iterable.

        Parameters
        ----------
        dimensions : `DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either `DimensionGroup`
            instances or `str` names. Will be automatically expanded to a
            complete `DimensionGroup`. These dimensions do not need to match
            the query's current `dimensions`. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are guaranteed
            to identify all dimensions (`DataCoordinate.hasFull` returns
            `True`), but will not contain `DimensionRecord` objects
            (`DataCoordinate.hasRecords` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
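
        Notes
        -----
        As a minimal illustrative sketch (the ``visit`` and ``detector``
        dimensions and the instrument name are assumptions about the
        repository, not requirements of this method)::

            with butler._query() as query:
                data_ids = query.where(instrument="LSSTCam").data_ids(["visit", "detector"])
                for data_id in data_ids:
                    print(data_id["visit"], data_id["detector"])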
        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
            if not dimensions <= self._tree.dimensions:
                tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    @overload
    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> SingleTypeDatasetQueryResults: ...  # pragma: no cover

    @overload
    def datasets(
        self,
        dataset_type: Iterable[str | DatasetType] | EllipsisType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetQueryResults: ...  # pragma: no cover

    def datasets(
        self,
        dataset_type: str | DatasetType | Iterable[str | DatasetType] | EllipsisType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetQueryResults:
        """Return a result object that is a `DatasetRef` iterable.

        Parameters
        ----------
        dataset_type : `str`, `DatasetType`, \
                `~collections.abc.Iterable` [ `str` or `DatasetType` ], \
                or ``...``
            The dataset type or types to search for. Passing ``...`` searches
            for all datasets in the given collections.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not provided
            or `None`, and the dataset has not already been joined into the
            query, the default collection search path for this butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef` of each `DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `.queries.DatasetQueryResults`
            Dataset references matching the given query criteria. Nested data
            IDs are guaranteed to include values for all implied dimensions
            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
            include dimension records (`DataCoordinate.hasRecords` will be
            `False`) unless
            `~.queries.DatasetQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included.
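
        As a minimal illustrative sketch (the ``raw`` dataset type and the
        collection name are assumptions about the repository)::

            with butler._query() as query:
                refs = query.datasets("raw", collections=["LSSTCam/raw/all"])
                for ref in refs.with_dimension_records():
                    print(ref.dataId, ref.run)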
        """
        queries: dict[str, Query] = {}
        if dataset_type is ...:
            if collections is None:
                collections = self._driver.get_default_collections()
            else:
                collections = tuple(ensure_iterable(collections))
            for _, summary in self._driver.resolve_collection_path(collections):
                for dataset_type_name in summary.dataset_types.names:
                    queries[dataset_type_name] = self.join_dataset_search(dataset_type_name, collections)
        else:
            for arg in ensure_iterable(dataset_type):
                dataset_type_name, query = self._join_dataset_search_impl(arg, collections)
                queries[dataset_type_name] = query

        single_type_results: list[SingleTypeDatasetQueryResults] = []
        for dataset_type_name in sorted(queries):
            query = queries[dataset_type_name]
            dataset_search = query._tree.datasets[dataset_type_name]
            if dataset_search.storage_class_name is None:
                raise MissingDatasetTypeError(
                    f"No storage class provided for unregistered dataset type {dataset_type_name!r}. "
                    "Provide a complete DatasetType object instead of a string name to turn this error "
                    "into an empty result set."
                )
            spec = DatasetRefResultSpec.model_construct(
                dataset_type_name=dataset_type_name,
                dimensions=dataset_search.dimensions,
                storage_class_name=dataset_search.storage_class_name,
                include_dimension_records=False,
                find_first=find_first,
            )
            single_type_results.append(
                SingleTypeDatasetQueryResults(self._driver, tree=query._tree, spec=spec)
            )
        if len(single_type_results) == 1:
            return single_type_results[0]
        else:
            return ChainedDatasetQueryResults(tuple(single_type_results))

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `DimensionRecord` iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
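
        Notes
        -----
        As a minimal illustrative sketch (the ``detector`` element and the
        instrument name are assumptions about the repository's dimension
        universe)::

            with butler._query() as query:
                records = query.where(instrument="LSSTCam").dimension_records("detector")
                for record in records:
                    print(record)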
        """
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined back
        in on the materialized dimension keys. But all constraints on those
        dimension keys (including dataset existence) are applied to the
        materialized rows.
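
        As an illustrative sketch (the ``raw`` dataset type and instrument
        name are assumptions about the repository), a query that will be used
        to produce more than one kind of result can be materialized once and
        then reused::

            with butler._query() as query:
                query = query.join_dataset_search("raw").where(instrument="LSSTCam")
                query = query.materialize()
                data_ids = list(query.data_ids(["exposure", "detector"]))
                refs = list(query.datasets("raw"))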
        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(self._tree, dimensions, datasets)
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
                    "Expand the dimensions or drop this dataset type in the arguments to materialize to "
                    "avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        dimensions: DimensionGroup | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results. If
            not present or `None`, the default collection search path will be
            used.
        dimensions : `DimensionGroup`, optional
            The dimensions to assume for the dataset type if it is not
            registered, or to check against if it is registered. When the
            dataset is not registered and this is not provided,
            `MissingDatasetTypeError` is raised, since we cannot construct a
            query without knowing the dataset's dimensions. Providing this
            argument causes the returned query to instead return no rows (as it
            does when the dataset type is registered but no matching datasets
            are found).

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the dimensions were provided but they do not match the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dimensions were not provided and the dataset type was
            not registered.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the same
        query context.
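
        As a minimal illustrative sketch (the ``calexp`` dataset type and
        collection name are assumptions about the repository)::

            with butler._query() as query:
                query = query.join_dataset_search("calexp", collections=["LSSTCam/runs/example"])
                # The joined search now constrains data IDs and makes this
                # dataset type's fields usable in 'where' and 'order_by'
                # expressions.
                data_ids = list(query.data_ids(["visit", "detector"]))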
        """
        _, query = self._join_dataset_search_impl(dataset_type, collections, dimensions)
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Iterable of `DataCoordinate`. All items must have the same
            dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
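
        Notes
        -----
        As a minimal illustrative sketch (the dimension names and values are
        assumptions about the repository)::

            data_ids = [
                DataCoordinate.standardize(
                    instrument="LSSTCam", detector=d, universe=butler.dimensions
                )
                for d in (10, 11, 12)
            ]
            with butler._query() as query:
                query = query.join_data_coordinates(data_ids)
                records = list(query.dimension_records("detector"))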
        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this method
        should rarely need to be called directly.
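
        As a minimal illustrative sketch (the dimension names are assumptions
        about the dimension universe)::

            with butler._query() as query:
                query = query.join_dimensions(["visit", "detector"])
                data_ids = list(query.data_ids(["visit", "detector"]))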
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may be
            `str` expressions to parse, `Predicate` objects (these are
            typically constructed via `expression_factory`) or data IDs.
        bind : `~collections.abc.Mapping`
            Mapping from string identifier appearing in a string expression to
            a literal value that should be substituted for it. This is
            recommended instead of embedding literals directly into the
            expression, especially for strings, timespans, or other types where
            quoting or formatting is nontrivial.
        **kwargs
            Data ID key value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query, it will be joined in, but dataset
        searches must already be joined into a query in order to reference
        their fields in expressions.

        Data ID values are not checked for consistency; they are extracted from
        ``args`` and then ``kwargs`` and combined, with later values overriding
        earlier ones.
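
        As a minimal illustrative sketch (the dimension names, instrument
        name, and the ``visit_min`` bind key are assumptions, not part of this
        API)::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    "instrument = 'LSSTCam' AND visit > visit_min",
                    x.detector < 100,
                    bind={"visit_min": 10000},
                    band="r",
                )
                data_ids = list(query.data_ids(["visit", "detector", "band"]))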
        """
        return Query(
            tree=self._tree.where(
                convert_where_args(self.dimensions, self.constraint_dataset_types, *args, bind=bind, **kwargs)
            ),
            driver=self._driver,
        )

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        dimensions: DimensionGroup | None = None,
    ) -> tuple[str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name.
        """
        # In this method we need the dimensions of the dataset type, but we
        # don't necessarily need the storage class, since the dataset may only
        # be used as an existence constraint. But we also want to remember the
        # storage class if it's passed in, so users don't get frustrated having
        # to pass it twice if they do want DatasetRefs back.
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            if dimensions is not None:
                raise TypeError("Cannot provide a full DatasetType object and separate dimensions.")
            dimensions = dataset_type.dimensions.as_group()
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
            elif dimensions != existing_search.dimensions:
                raise DatasetTypeError(
                    f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
                    f"previously-joined dimensions {existing_search.dimensions}."
                )
            if storage_class_name is None or storage_class_name == existing_search.storage_class_name:
                # Nothing to do; this dataset has already been joined in with
                # the parameters we want. We don't need to check against the
                # registered dataset type since that will have been done the
                # first time we joined this dataset type in.
                return dataset_type_name, self
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # See if the dataset type is registered, to look up and/or check
        # dimensions, and get a storage class if there isn't one already.
        try:
            resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
            resolved_dimensions = resolved_dataset_type.dimensions.as_group()
            if storage_class_name is None:
                storage_class_name = resolved_dataset_type.storageClass_name
        except MissingDatasetTypeError:
            if dimensions is None:
                raise
            resolved_dimensions = dimensions
        else:
            if dimensions is not None and dimensions != resolved_dimensions:
                raise DatasetTypeError(
                    f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
                    f"registered dimensions {resolved_dimensions}."
                )
            if (
                storage_class_name is not None
                and storage_class_name != resolved_dataset_type.storageClass_name
            ):
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
                        f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
                    )
        # We do not check the storage class for consistency with the registered
        # storage class at this point, because it's not going to be used for
        # anything yet other than a default that can still be overridden.
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
            storage_class_name=storage_class_name,
        )
        return dataset_type_name, Query(
            self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)
        )