Coverage for python/lsst/daf/butler/registry/_registry.py : 14%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Type,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48import lsst.sphgeom
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataId,
53 DatasetRef,
54 DatasetType,
55 Dimension,
56 DimensionElement,
57 DimensionGraph,
58 DimensionRecord,
59 DimensionUniverse,
60 ExpandedDataCoordinate,
61 FakeDatasetRef,
62 StorageClassFactory,
63)
64from ..core import ddl
65from ..core.utils import doImport, iterable, transactional
66from ._config import RegistryConfig
67from .queries import (
68 QueryBuilder,
69 QuerySummary,
70)
71from .tables import makeRegistryTableSpecs
72from ._collectionType import CollectionType
73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch
76if TYPE_CHECKING:
77 from ..butlerConfig import ButlerConfig
78 from ..core import (
79 Quantum
80 )
81 from .interfaces import (
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 )
90@dataclass
91class ConsistentDataIds:
92 """A struct used to report relationships between data IDs by
93 `Registry.relateDataIds`.
95 If an instance of this class is returned (instead of `None`), the data IDs
96 are "not inconsistent" - any keys they have in common have the same value,
97 and any spatial or temporal relationships they have at least might involve
98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
99 to `True` in boolean contexts.
100 """
102 overlaps: bool
103 """If `True`, the data IDs have at least one key in common, associated with
104 the same value.
106 Note that data IDs are not inconsistent even if overlaps is `False` - they
107 may simply have no keys in common, which means they cannot have
108 inconsistent values for any keys. They may even be equal, in the case that
109 both data IDs are empty.
111 This field does *not* indicate whether a spatial or temporal overlap
112 relationship exists.
113 """
115 contains: bool
116 """If `True`, all keys in the first data ID are in the second, and are
117 associated with the same values.
119 This includes case where the first data ID is empty.
120 """
122 within: bool
123 """If `True`, all keys in the second data ID are in the first, and are
124 associated with the same values.
126 This includes case where the second data ID is empty.
127 """
129 @property
130 def equal(self) -> bool:
131 """If `True`, the two data IDs are the same.
133 Data IDs are equal if they have both a `contains` and a `within`
134 relationship.
135 """
136 return self.contains and self.within
138 @property
139 def disjoint(self) -> bool:
140 """If `True`, the two data IDs have no keys in common.
142 This is simply the opposite of `overlaps`. Disjoint data IDs are by
143 definition not inconsistent.
144 """
145 return not self.overlaps
147 def __bool__(self) -> bool:
148 return True
151class Registry:
152 """Registry interface.
154 Parameters
155 ----------
156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
157 Registry configuration
158 """
160 defaultConfigFile = None
161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
162 absolute path. Can be `None` if no defaults are specified.
163 """
165 @classmethod
166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Uses ``registry.cls`` from `config` to determine which subclass to
171 instantiate.
173 Parameters
174 ----------
175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
176 Registry configuration
177 create : `bool`, optional
178 Assume empty Registry and create a new one.
179 butlerRoot : `str`, optional
180 Path to the repository root this `Registry` will manage.
181 writeable : `bool`, optional
182 If `True` (default) create a read-write connection to the database.
184 Returns
185 -------
186 registry : `Registry` (subclass)
187 A new `Registry` subclass instance.
188 """
189 if not isinstance(config, RegistryConfig):
190 if isinstance(config, str) or isinstance(config, Config):
191 config = RegistryConfig(config)
192 else:
193 raise ValueError("Incompatible Registry configuration: {}".format(config))
194 config.replaceRoot(butlerRoot)
195 DatabaseClass = config.getDatabaseClass()
196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
197 namespace=config.get("namespace"), writeable=writeable)
198 universe = DimensionUniverse(config)
199 opaque = doImport(config["managers", "opaque"])
200 dimensions = doImport(config["managers", "dimensions"])
201 collections = doImport(config["managers", "collections"])
202 datasets = doImport(config["managers", "datasets"])
203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
204 datasets=datasets, create=create)
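    # Illustrative usage sketch: constructing a `Registry` from configuration.
    # The config path "butler.yaml" and the import location are assumptions for
    # the example, not requirements of this API.
    #
    #     from lsst.daf.butler import Registry, RegistryConfig
    #     config = RegistryConfig("butler.yaml")   # hypothetical config file
    #     registry = Registry.fromConfig(config, create=True, writeable=True)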
206 def __init__(self, database: Database, universe: DimensionUniverse, *,
207 opaque: Type[OpaqueTableStorageManager],
208 dimensions: Type[DimensionRecordStorageManager],
209 collections: Type[CollectionManager],
210 datasets: Type[DatasetRecordStorageManager],
211 create: bool = False):
212 self._db = database
213 self.storageClasses = StorageClassFactory()
214 with self._db.declareStaticTables(create=create) as context:
215 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
216 self._collections = collections.initialize(self._db, context)
217 self._datasets = datasets.initialize(self._db, context,
218 collections=self._collections,
219 universe=self.dimensions)
220 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
221 self._collections,
222 self._datasets))
223 self._opaque = opaque.initialize(self._db, context)
224 self._collections.refresh()
225 self._datasets.refresh(universe=self._dimensions.universe)
227 def __str__(self) -> str:
228 return str(self._db)
230 def __repr__(self) -> str:
231 return f"Registry({self._db!r}, {self.dimensions!r})"
233 def isWriteable(self) -> bool:
234 """Return `True` if this registry allows write operations, and `False`
235 otherwise.
236 """
237 return self._db.isWriteable()
239 @property
240 def dimensions(self) -> DimensionUniverse:
241 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
242 """
243 return self._dimensions.universe
245 @contextlib.contextmanager
246 def transaction(self):
247 """Return a context manager that represents a transaction.
248 """
249 # TODO make savepoint=False the default.
250 try:
251 with self._db.transaction():
252 yield
253 except BaseException:
254 # TODO: this clears the caches sometimes when we wouldn't actually
255 # need to. Can we avoid that?
256 self._dimensions.clearCaches()
257 raise
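    # Illustrative usage sketch: grouping registry writes so they are committed
    # or rolled back together. ``registry`` and the record contents are
    # assumptions for the example.
    #
    #     with registry.transaction():
    #         registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #         # If anything later in this block raises, the insert above is
    #         # rolled back and the dimension-record caches are cleared.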
259 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
260 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
261 other data repository client.
263 Opaque table records can be added via `insertOpaqueData`, retrieved via
264 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
266 Parameters
267 ----------
268 tableName : `str`
269 Logical name of the opaque table. This may differ from the
270 actual name used in the database by a prefix and/or suffix.
271 spec : `ddl.TableSpec`
272 Specification for the table to be added.
273 """
274 self._opaque.register(tableName, spec)
276 @transactional
277 def insertOpaqueData(self, tableName: str, *data: dict):
278 """Insert records into an opaque table.
280 Parameters
281 ----------
282 tableName : `str`
283 Logical name of the opaque table. Must match the name used in a
284 previous call to `registerOpaqueTable`.
285 data
286 Each additional positional argument is a dictionary that represents
287 a single row to be added.
288 """
289 self._opaque[tableName].insert(*data)
291 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
292 """Retrieve records from an opaque table.
294 Parameters
295 ----------
296 tableName : `str`
297 Logical name of the opaque table. Must match the name used in a
298 previous call to `registerOpaqueTable`.
299 where
300 Additional keyword arguments are interpreted as equality
301 constraints that restrict the returned rows (combined with AND);
302 keyword arguments are column names and values are the values they
303 must have.
305 Yields
306 ------
307 row : `dict`
308 A dictionary representing a single result row.
309 """
310 yield from self._opaque[tableName].fetch(**where)
312 @transactional
313 def deleteOpaqueData(self, tableName: str, **where: Any):
314 """Remove records from an opaque table.
316 Parameters
317 ----------
318 tableName : `str`
319 Logical name of the opaque table. Must match the name used in a
320 previous call to `registerOpaqueTable`.
321 where
322 Additional keyword arguments are interpreted as equality
323 constraints that restrict the deleted rows (combined with AND);
324 keyword arguments are column names and values are the values they
325 must have.
326 """
327 self._opaque[tableName].delete(**where)
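    # Illustrative usage sketch: a Datastore-like client registering an opaque
    # table and round-tripping a record. The table name, column definitions,
    # and the exact `ddl.FieldSpec` arguments shown are assumptions for the
    # example.
    #
    #     import sqlalchemy
    #     from lsst.daf.butler.core import ddl
    #     spec = ddl.TableSpec(fields=[
    #         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #     ])
    #     registry.registerOpaqueTable("my_datastore_records", spec)
    #     registry.insertOpaqueData("my_datastore_records",
    #                               {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)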
329 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
330 """Add a new collection if one with the given name does not exist.
332 Parameters
333 ----------
334 name : `str`
335 The name of the collection to create.
336 type : `CollectionType`
337 Enum value indicating the type of collection to create.
339 Notes
340 -----
341 This method cannot be called within transactions, as it needs to be
342 able to perform its own transaction to be concurrent.
343 """
344 self._collections.register(name, type)
346 def getCollectionType(self, name: str) -> CollectionType:
347 """Return an enumeration value indicating the type of the given
348 collection.
350 Parameters
351 ----------
352 name : `str`
353 The name of the collection.
355 Returns
356 -------
357 type : `CollectionType`
358 Enum value indicating the type of this collection.
360 Raises
361 ------
362 MissingCollectionError
363 Raised if no collection with the given name exists.
364 """
365 return self._collections.find(name).type
367 def registerRun(self, name: str):
368 """Add a new run if one with the given name does not exist.
370 Parameters
371 ----------
372 name : `str`
373 The name of the run to create.
375 Notes
376 -----
377 This method cannot be called within transactions, as it needs to be
378 able to perform its own transaction to be concurrent.
379 """
380 self._collections.register(name, CollectionType.RUN)
382 @transactional
383 def removeCollection(self, name: str):
384 """Completely remove the given collection.
386 Parameters
387 ----------
388 name : `str`
389 The name of the collection to remove.
391 Raises
392 ------
393 MissingCollectionError
394 Raised if no collection with the given name exists.
396 Notes
397 -----
398 If this is a `~CollectionType.RUN` collection, all datasets and quanta
399 in it are also fully removed. This requires that those datasets be
400 removed (or at least trashed) from any datastores that hold them first.
402 A collection may not be deleted as long as it is referenced by a
403 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
404 be deleted or redefined first.
405 """
406 self._collections.remove(name)
408 def getCollectionChain(self, parent: str) -> CollectionSearch:
409 """Return the child collections in a `~CollectionType.CHAINED`
410 collection.
412 Parameters
413 ----------
414 parent : `str`
415 Name of the chained collection. Must have already been added via
416 a call to `Registry.registerCollection`.
418 Returns
419 -------
420 children : `CollectionSearch`
421 An object that defines the search path of the collection.
422 See :ref:`daf_butler_collection_expressions` for more information.
424 Raises
425 ------
426 MissingCollectionError
427 Raised if ``parent`` does not exist in the `Registry`.
428 TypeError
429 Raised if ``parent`` does not correspond to a
430 `~CollectionType.CHAINED` collection.
431 """
432 record = self._collections.find(parent)
433 if record.type is not CollectionType.CHAINED:
434 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
435 return record.children
437 @transactional
438 def setCollectionChain(self, parent: str, children: Any):
439 """Define or redefine a `~CollectionType.CHAINED` collection.
441 Parameters
442 ----------
443 parent : `str`
444 Name of the chained collection. Must have already been added via
445 a call to `Registry.registerCollection`.
446 children : `Any`
447 An expression defining an ordered search of child collections,
448 generally an iterable of `str`. Restrictions on the dataset types
449 to be searched can also be included, by passing a mapping or an
450 iterable containing tuples; see
451 :ref:`daf_butler_collection_expressions` for more information.
453 Raises
454 ------
455 MissingCollectionError
456 Raised when any of the given collections do not exist in the
457 `Registry`.
458 TypeError
459 Raised if ``parent`` does not correspond to a
460 `~CollectionType.CHAINED` collection.
461 ValueError
462 Raised if the given collections contain a cycle.
463 """
464 record = self._collections.find(parent)
465 if record.type is not CollectionType.CHAINED:
466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
467 children = CollectionSearch.fromExpression(children)
468 if children != record.children:
469 record.update(self._collections, children)
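    # Illustrative usage sketch: defining a CHAINED collection that searches a
    # run and a tagged collection in order. The collection names are
    # hypothetical.
    #
    #     registry.registerRun("runs/2020-01")
    #     registry.registerCollection("calibs/best", CollectionType.TAGGED)
    #     registry.registerCollection("defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("defaults", ["runs/2020-01", "calibs/best"])
    #     children = registry.getCollectionChain("defaults")  # a CollectionSearch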
471 def registerDatasetType(self, datasetType: DatasetType) -> bool:
472 """
473 Add a new `DatasetType` to the Registry.
475 It is not an error to register the same `DatasetType` twice.
477 Parameters
478 ----------
479 datasetType : `DatasetType`
480 The `DatasetType` to be added.
482 Returns
483 -------
484 inserted : `bool`
485 `True` if ``datasetType`` was inserted, `False` if an identical
486 existing `DatasetType` was found. Note that in either case the
487 DatasetType is guaranteed to be defined in the Registry
488 consistently with the given definition.
490 Raises
491 ------
492 ValueError
493 Raised if the dimensions or storage class are invalid.
494 ConflictingDefinitionError
495 Raised if this DatasetType is already registered with a different
496 definition.
498 Notes
499 -----
500 This method cannot be called within transactions, as it needs to be
501 able to perform its own transaction to be concurrent.
502 """
503 _, inserted = self._datasets.register(datasetType)
504 return inserted
506 def getDatasetType(self, name: str) -> DatasetType:
507 """Get the `DatasetType`.
509 Parameters
510 ----------
511 name : `str`
512 Name of the type.
514 Returns
515 -------
516 type : `DatasetType`
517 The `DatasetType` associated with the given name.
519 Raises
520 ------
521 KeyError
522 The requested DatasetType could not be found in the registry.
523 """
524 storage = self._datasets.find(name)
525 if storage is None:
526 raise KeyError(f"DatasetType '{name}' could not be found.")
527 return storage.datasetType
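    # Illustrative usage sketch: registering a dataset type and reading it back.
    # The name, dimensions, and storage class are assumptions for the example.
    #
    #     datasetType = DatasetType(
    #         "calexp",
    #         dimensions=registry.dimensions.extract(["instrument", "visit", "detector"]),
    #         storageClass="ExposureF",
    #     )
    #     registry.registerDatasetType(datasetType)  # True on first registration
    #     assert registry.getDatasetType("calexp") == datasetType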
529 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
530 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
531 """Find a dataset given its `DatasetType` and data ID.
533 This can be used to obtain a `DatasetRef` that permits the dataset to
534 be read from a `Datastore`. If the dataset is a component and cannot
535 be found using the provided dataset type, a dataset ref for the parent
536 will be returned instead but with the correct dataset type.
538 Parameters
539 ----------
540 datasetType : `DatasetType` or `str`
541 A `DatasetType` or the name of one.
542 dataId : `dict` or `DataCoordinate`, optional
543 A `dict`-like object containing the `Dimension` links that identify
544 the dataset within a collection.
545 collections
546 An expression that fully or partially identifies the collections
547 to search for the dataset, such as a `str`, `re.Pattern`, or
548 iterable thereof. `...` can be used to return all collections.
549 See :ref:`daf_butler_collection_expressions` for more information.
550 **kwargs
551 Additional keyword arguments passed to
552 `DataCoordinate.standardize` to convert ``dataId`` to a true
553 `DataCoordinate` or augment an existing one.
555 Returns
556 -------
557 ref : `DatasetRef`
558 A reference to the dataset, or `None` if no matching Dataset
559 was found.
561 Raises
562 ------
563 LookupError
564 Raised if one or more data ID keys are missing or the dataset type
565 does not exist.
566 MissingCollectionError
567 Raised if any of ``collections`` does not exist in the registry.
568 """
569 if isinstance(datasetType, DatasetType):
570 storage = self._datasets.find(datasetType.name)
571 if storage is None:
572 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
573 else:
574 storage = self._datasets.find(datasetType)
575 if storage is None:
576 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
577 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
578 universe=self.dimensions, **kwargs)
579 collections = CollectionSearch.fromExpression(collections)
580 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
581 result = storage.find(collectionRecord, dataId)
582 if result is not None:
583 if result.datasetType.isComposite():
584 result = self._datasets.fetchComponents(result)
585 return result
587 # fallback to the parent if we got nothing and this was a component
588 if storage.datasetType.isComponent():
589 parentType, _ = storage.datasetType.nameAndComponent()
590 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
591 if parentRef is not None:
592 # Should already conform and we know no components
593 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
594 run=parentRef.run, conform=False, hasParentId=True)
596 return None
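    # Illustrative usage sketch: looking up a single dataset by dataset type and
    # data ID. The dataset type, data ID values, and collection name are
    # assumptions for the example.
    #
    #     ref = registry.findDataset("calexp",
    #                                instrument="DummyCam", visit=42, detector=1,
    #                                collections=["runs/2020-01"])
    #     if ref is not None:
    #         print(ref.id, ref.run)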
598 @transactional
599 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
600 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
601 ) -> List[DatasetRef]:
602 """Insert one or more datasets into the `Registry`
604 This always adds new datasets; to associate existing datasets with
605 a new collection, use ``associate``.
607 Parameters
608 ----------
609 datasetType : `DatasetType` or `str`
610 A `DatasetType` or the name of one.
611 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
612 Dimension-based identifiers for the new datasets.
613 run : `str`
614 The name of the run that produced the datasets.
615 producer : `Quantum`
616 Unit of work that produced the datasets. May be `None` to store
617 no provenance information, but if present the `Quantum` must
618 already have been added to the Registry.
619 recursive : `bool`
620 If `True`, recursively add datasets and attach entries for component
621 datasets as well.
623 Returns
624 -------
625 refs : `list` of `DatasetRef`
626 Resolved `DatasetRef` instances for all given data IDs (in the same
627 order).
629 Raises
630 ------
631 ConflictingDefinitionError
632 If a dataset with the same dataset type and data ID as one of those
633 given already exists in ``run``.
634 MissingCollectionError
635 Raised if ``run`` does not exist in the registry.
636 """
637 if isinstance(datasetType, DatasetType):
638 storage = self._datasets.find(datasetType.name)
639 if storage is None:
640 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
641 else:
642 storage = self._datasets.find(datasetType)
643 if storage is None:
644 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
645 runRecord = self._collections.find(run)
646 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds]
647 try:
648 refs = list(storage.insert(runRecord, dataIds, quantum=producer))
649 except sqlalchemy.exc.IntegrityError as err:
650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
651 f"one or more datasets of type {storage.datasetType} into "
652 f"collection '{run}'. "
653 f"This probably means a dataset with the same data ID "
654 f"and dataset type already exists, but it may also mean a "
655 f"dimension row is missing.") from err
656 if recursive and storage.datasetType.isComposite():
657 # Insert component rows by recursing.
658 composites = defaultdict(dict)
659 # TODO: we really shouldn't be inserting all components defined by
660 # the storage class, because there's no guarantee all of them are
661 # actually present in these datasets.
662 for componentName in storage.datasetType.storageClass.components:
663 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName)
664 componentRefs = self.insertDatasets(componentDatasetType,
665 dataIds=dataIds,
666 run=run,
667 producer=producer,
668 recursive=True)
669 for parentRef, componentRef in zip(refs, componentRefs):
670 composites[parentRef][componentName] = componentRef
671 if composites:
672 refs = list(self._datasets.attachComponents(composites.items()))
673 return refs
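    # Illustrative usage sketch: inserting a new dataset into a RUN collection.
    # The dataset type, data ID, and run name are assumptions, and the
    # dimension rows referenced by the data ID must already exist.
    #
    #     registry.registerRun("runs/2020-01")
    #     ref, = registry.insertDatasets(
    #         "calexp",
    #         dataIds=[{"instrument": "DummyCam", "visit": 42, "detector": 1}],
    #         run="runs/2020-01",
    #     )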
675 def getDataset(self, id: int) -> Optional[DatasetRef]:
676 """Retrieve a Dataset entry.
678 Parameters
679 ----------
680 id : `int`
681 The unique identifier for the dataset.
683 Returns
684 -------
685 ref : `DatasetRef` or `None`
686 A ref to the Dataset, or `None` if no matching Dataset
687 was found.
688 """
689 ref = self._datasets.getDatasetRef(id)
690 if ref is None:
691 return None
692 if ref.datasetType.isComposite():
693 return self._datasets.fetchComponents(ref)
694 return ref
696 @transactional
697 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True):
698 """Remove datasets from the Registry.
700 The datasets will be removed unconditionally from all collections, and
701 any `Quantum` that consumed this dataset will instead be marked with
702 having a NULL input. `Datastore` records will *not* be deleted; the
703 caller is responsible for ensuring that the dataset has already been
704 removed from all Datastores.
706 Parameters
707 ----------
708 refs : `Iterable` of `DatasetRef`
709 References to the datasets to be removed. Must include a valid
710 ``id`` attribute, and should be considered invalidated upon return.
711 recursive : `bool`, optional
712 If `True`, remove all component datasets as well. Note that
713 this only removes components that are actually included in the
714 given `DatasetRef` instances, which may not be the same as those in
715 the database (especially if they were obtained from
716 `queryDatasets`, which does not populate `DatasetRef.components`).
718 Raises
719 ------
720 AmbiguousDatasetError
721 Raised if any ``ref.id`` is `None`.
722 OrphanedRecordError
723 Raised if any dataset is still present in any `Datastore`.
724 """
725 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
726 storage = self._datasets.find(datasetType.name)
727 try:
728 storage.delete(refsForType)
729 except sqlalchemy.exc.IntegrityError as err:
730 raise OrphanedRecordError("One or more datasets is still "
731 "present in one or more Datastores.") from err
733 @transactional
734 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]):
735 """Attach components to a dataset.
737 Parameters
738 ----------
739 parent : `DatasetRef`
740 A reference to the parent dataset.
741 components : `Mapping` [ `str`, `DatasetRef` ]
742 Mapping from component name to the `DatasetRef` for that component.
744 Returns
745 -------
746 ref : `DatasetRef`
747 An updated version of ``parent`` with the given components attached.
754 Raises
755 ------
756 AmbiguousDatasetError
757 Raised if ``parent.id`` or any `DatasetRef.id` in ``components``
758 is `None`.
759 """
760 for name, ref in components.items():
761 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]:
762 raise TypeError(f"Expected storage class "
763 f"'{parent.datasetType.storageClass.components[name].name}' "
764 f"for component '{name}' of dataset {parent}; got "
765 f"dataset {ref} with storage class "
766 f"'{ref.datasetType.storageClass.name}'.")
767 ref, = self._datasets.attachComponents([(parent, components)])
768 return ref
770 @transactional
771 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
772 """Add existing datasets to a `~CollectionType.TAGGED` collection.
774 If a `DatasetRef` with the exact same integer ID is already in the
775 collection, nothing is changed.
776 `DatasetType` and data ID but with different integer ID
777 exists in the collection, `ConflictingDefinitionError` is raised.
779 Parameters
780 ----------
781 collection : `str`
782 Indicates the collection the datasets should be associated with.
783 refs : `Iterable` [ `DatasetRef` ]
784 An iterable of resolved `DatasetRef` instances that already exist
785 in this `Registry`.
786 recursive : `bool`, optional
787 If `True`, associate all component datasets as well. Note that
788 this only associates components that are actually included in the
789 given `DatasetRef` instances, which may not be the same as those in
790 the database (especially if they were obtained from
791 `queryDatasets`, which does not populate `DatasetRef.components`).
793 Raises
794 ------
795 ConflictingDefinitionError
796 If a Dataset with the given `DatasetRef` already exists in the
797 given collection.
798 AmbiguousDatasetError
799 Raised if ``any(ref.id is None for ref in refs)``.
800 MissingCollectionError
801 Raised if ``collection`` does not exist in the registry.
802 TypeError
803 Raised if adding new datasets to the given ``collection`` is not
804 allowed.
805 """
806 collectionRecord = self._collections.find(collection)
807 if collectionRecord.type is not CollectionType.TAGGED:
808 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
809 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
810 storage = self._datasets.find(datasetType.name)
811 try:
812 storage.associate(collectionRecord, refsForType)
813 except sqlalchemy.exc.IntegrityError as err:
814 raise ConflictingDefinitionError(
815 f"Constraint violation while associating dataset of type {datasetType.name} with "
816 f"collection {collection}. This probably means that one or more datasets with the same "
817 f"dataset type and data ID already exist in the collection, but it may also indicate "
818 f"that the datasets do not exist."
819 ) from err
821 @transactional
822 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
823 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
825 ``collection`` and ``ref`` combinations that are not currently
826 associated are silently ignored.
828 Parameters
829 ----------
830 collection : `str`
831 The collection the datasets should no longer be associated with.
832 refs : `Iterable` [ `DatasetRef` ]
833 An iterable of resolved `DatasetRef` instances that already exist
834 in this `Registry`.
835 recursive : `bool`, optional
836 If `True`, disassociate all component datasets as well. Note that
837 this only disassociates components that are actually included in
838 the given `DatasetRef` instances, which may not be the same as
839 those in the database (especially if they were obtained from
840 `queryDatasets`, which does not populate `DatasetRef.components`).
842 Raises
843 ------
844 AmbiguousDatasetError
845 Raised if any of the given dataset references is unresolved.
846 MissingCollectionError
847 Raised if ``collection`` does not exist in the registry.
848 TypeError
849 Raised if removing datasets from the given ``collection`` is not
850 allowed.
851 """
852 collectionRecord = self._collections.find(collection)
853 if collectionRecord.type is not CollectionType.TAGGED:
854 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
855 "expected TAGGED.")
856 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
857 storage = self._datasets.find(datasetType.name)
858 storage.disassociate(collectionRecord, refsForType)
860 @transactional
861 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
862 """Record that a datastore holds the given datasets.
864 Typically used by `Datastore`.
866 Parameters
867 ----------
868 datastoreName : `str`
869 Name of the datastore holding these datasets.
870 refs : `~collections.abc.Iterable` of `DatasetRef`
871 References to the datasets.
873 Raises
874 ------
875 AmbiguousDatasetError
876 Raised if ``any(ref.id is None for ref in refs)``.
877 """
878 self._db.insert(
879 self._tables.dataset_location,
880 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
881 )
883 @transactional
884 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
885 """Move the dataset location information to trash.
887 Parameters
888 ----------
889 datastoreName : `str`
890 Name of the datastore holding these datasets.
891 refs : `~collections.abc.Iterable` of `DatasetRef`
892 References to the datasets.
893 """
894 # We only want to move rows that already exist in the main table
895 filtered = self.checkDatasetLocations(datastoreName, refs)
896 self.canDeleteDatasetLocations(datastoreName, filtered)
897 self.removeDatasetLocation(datastoreName, filtered)
899 @transactional
900 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
901 """Record that a datastore can delete this dataset
903 Parameters
904 ----------
905 datastoreName : `str`
906 Name of the datastore holding these datasets.
907 refs : `~collections.abc.Iterable` of `DatasetRef`
908 References to the datasets.
910 Raises
911 ------
912 AmbiguousDatasetError
913 Raised if ``any(ref.id is None for ref in refs)``.
914 """
915 self._db.insert(
916 self._tables.dataset_location_trash,
917 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
918 )
920 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
921 """Check which refs are listed for this datastore.
923 Parameters
924 ----------
925 datastoreName : `str`
926 Name of the datastore holding these datasets.
927 refs : `~collections.abc.Iterable` of `DatasetRef`
928 References to the datasets.
930 Returns
931 -------
932 present : `list` of `DatasetRef`
933 All the `DatasetRef` that are listed.
934 """
936 table = self._tables.dataset_location
937 result = self._db.query(
938 sqlalchemy.sql.select(
939 [table.columns.datastore_name, table.columns.dataset_id]
940 ).where(
941 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
942 table.columns.datastore_name == datastoreName)
943 )
944 ).fetchall()
946 matched_ids = {r["dataset_id"] for r in result}
947 return [ref for ref in refs if ref.id in matched_ids]
949 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
950 """Retrieve datastore locations for a given dataset.
952 Typically used by `Datastore`.
954 Parameters
955 ----------
956 ref : `DatasetRef`
957 A reference to the dataset for which to retrieve storage
958 information.
960 Returns
961 -------
962 datastores : `set` of `str`
963 All the matching datastores holding this dataset. Empty set
964 if the dataset does not exist anywhere.
966 Raises
967 ------
968 AmbiguousDatasetError
969 Raised if ``ref.id`` is `None`.
970 """
971 table = self._tables.dataset_location
972 result = self._db.query(
973 sqlalchemy.sql.select(
974 [table.columns.datastore_name]
975 ).where(
976 table.columns.dataset_id == ref.id
977 )
978 ).fetchall()
979 return {r["datastore_name"] for r in result}
981 @transactional
982 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
983 """Retrieve all the dataset ref IDs that are in the trash
984 associated with the specified datastore.
986 Parameters
987 ----------
988 datastoreName : `str`
989 The relevant datastore name to use.
991 Returns
992 -------
993 refs : `set` of `FakeDatasetRef`
994 Minimal refs (dataset IDs only) for the datasets that can be safely
995 removed from this datastore. Can be empty.
996 """
997 table = self._tables.dataset_location_trash
998 result = self._db.query(
999 sqlalchemy.sql.select(
1000 [table.columns.dataset_id]
1001 ).where(
1002 table.columns.datastore_name == datastoreName
1003 )
1004 ).fetchall()
1005 return {FakeDatasetRef(r["dataset_id"]) for r in result}
1007 @transactional
1008 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
1009 """Remove datastore location associated with these datasets from trash.
1011 Typically used by `Datastore` when a dataset is removed.
1013 Parameters
1014 ----------
1015 datastoreName : `str`
1016 Name of this `Datastore`.
1017 refs : iterable of `FakeDatasetRef`
1018 The datasets whose trash entries are to be removed.
1020 Raises
1021 ------
1022 AmbiguousDatasetError
1023 Raised if ``ref.id`` is `None`.
1024 """
1025 if not refs:
1026 return
1027 self._db.delete(
1028 self._tables.dataset_location_trash,
1029 ["dataset_id", "datastore_name"],
1030 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
1031 )
1033 @transactional
1034 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
1035 """Remove datastore location associated with this dataset.
1037 Typically used by `Datastore` when a dataset is removed.
1039 Parameters
1040 ----------
1041 datastoreName : `str`
1042 Name of this `Datastore`.
1043 refs : iterable of `DatasetRef`
1044 References to the datasets for which information is to be removed.
1046 Raises
1047 ------
1048 AmbiguousDatasetError
1049 Raised if ``ref.id`` is `None`.
1050 """
1051 if not refs:
1052 return
1053 self._db.delete(
1054 self._tables.dataset_location,
1055 ["dataset_id", "datastore_name"],
1056 *[{"dataset_id": ref.getCheckedId(), "datastore_name": datastoreName} for ref in refs]
1057 )
1059 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1060 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1061 """Expand a dimension-based data ID to include additional information.
1063 Parameters
1064 ----------
1065 dataId : `DataCoordinate` or `dict`, optional
1066 Data ID to be expanded; augmented and overridden by ``kwds``.
1067 graph : `DimensionGraph`, optional
1068 Set of dimensions for the expanded ID. If `None`, the dimensions
1069 will be inferred from the keys of ``dataId`` and ``kwds``.
1070 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1071 are silently ignored, providing a way to extract and expand a
1072 subset of a data ID.
1073 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1074 Dimension record data to use before querying the database for that
1075 data.
1076 **kwds
1077 Additional keywords are treated like additional key-value pairs for
1078 ``dataId``, extending and overriding it.
1080 Returns
1081 -------
1082 expanded : `ExpandedDataCoordinate`
1083 A data ID that includes full metadata for all of the dimensions it
1084 identifies.
1085 """
1086 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1087 if isinstance(standardized, ExpandedDataCoordinate):
1088 return standardized
1089 elif isinstance(dataId, ExpandedDataCoordinate):
1090 records = dict(records) if records is not None else {}
1091 records.update(dataId.records)
1092 else:
1093 records = dict(records) if records is not None else {}
1094 keys = dict(standardized)
1095 regions = []
1096 timespans = []
1097 for element in standardized.graph.primaryKeyTraversalOrder:
1098 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1099 if record is ...:
1100 storage = self._dimensions[element]
1101 record = storage.fetch(keys)
1102 records[element] = record
1103 if record is not None:
1104 for d in element.implied:
1105 value = getattr(record, d.name)
1106 if keys.setdefault(d, value) != value:
1107 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
1108 f"but {element.name} implies {d.name}={value!r}.")
1109 if element in standardized.graph.spatial and record.region is not None:
1110 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
1111 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
1112 f"is disjoint with those for other elements.")
1113 regions.append(record.region)
1114 if element in standardized.graph.temporal:
1115 if any(not record.timespan.overlaps(t) for t in timespans):
1116 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
1117 f" is disjoint with those for other elements.")
1118 timespans.append(record.timespan)
1119 else:
1120 if element in standardized.graph.required:
1121 raise LookupError(
1122 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1123 )
1124 if element.alwaysJoin:
1125 raise InconsistentDataIdError(
1126 f"Could not fetch record for element {element.name} via keys {keys}, ",
1127 f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
1128 f"related."
1129 )
1130 records.update((d, None) for d in element.implied)
1131 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
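    # Illustrative usage sketch: expanding a minimal data ID into an
    # `ExpandedDataCoordinate`. The dimension values are assumptions, and the
    # corresponding dimension rows must already exist in the registry.
    #
    #     expanded = registry.expandDataId({"instrument": "DummyCam", "detector": 1})
    #     # ``expanded.records`` maps each dimension element to the record that
    #     # was fetched (or provided) for it.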
1133 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
1134 """Compare the keys and values of a pair of data IDs for consistency.
1136 See `ConsistentDataIds` for more information.
1138 Parameters
1139 ----------
1140 a : `dict` or `DataCoordinate`
1141 First data ID to be compared.
1142 b : `dict` or `DataCoordinate`
1143 Second data ID to be compared.
1145 Returns
1146 -------
1147 relationship : `ConsistentDataIds` or `None`
1148 Relationship information. This is not `None` and coerces to
1149 `True` in boolean contexts if and only if the data IDs are
1150 consistent in terms of all common key-value pairs, all many-to-many
1151 join tables, and all spatial and temporal relationships.
1152 """
1153 a = DataCoordinate.standardize(a, universe=self.dimensions)
1154 b = DataCoordinate.standardize(b, universe=self.dimensions)
1155 aFull = getattr(a, "full", None)
1156 bFull = getattr(b, "full", None)
1157 aBest = aFull if aFull is not None else a
1158 bBest = bFull if bFull is not None else b
1159 jointKeys = aBest.keys() & bBest.keys()
1160 # If any common values are not equal, we know they are inconsistent.
1161 if any(aBest[k] != bBest[k] for k in jointKeys):
1162 return None
1163 # If the graphs are equal, we know the data IDs are.
1164 if a.graph == b.graph:
1165 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
1166 # Result is still inconclusive. Try to expand a data ID containing
1167 # keys from both; that will fail if they are inconsistent.
1168 # First, if either input was already an ExpandedDataCoordinate, extract
1169 # its records so we don't have to query for them.
1170 records = {}
1171 if hasattr(a, "records"):
1172 records.update(a.records)
1173 if hasattr(b, "records"):
1174 records.update(b.records)
1175 try:
1176 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1177 except InconsistentDataIdError:
1178 return None
1179 # We know the answer is not `None`; time to figure out what it is.
1180 return ConsistentDataIds(
1181 contains=(a.graph >= b.graph),
1182 within=(a.graph <= b.graph),
1183 overlaps=bool(a.graph & b.graph),
1184 )
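    # Illustrative usage sketch: comparing two data IDs. The keys and values are
    # assumptions, and any dimension rows they reference must exist so that the
    # expansion step can run.
    #
    #     rel = registry.relateDataIds({"instrument": "DummyCam", "visit": 42},
    #                                  {"instrument": "DummyCam"})
    #     if rel:                  # not None, so the data IDs are not inconsistent
    #         assert rel.contains  # the first data ID includes all keys of the second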
1186 def insertDimensionData(self, element: Union[DimensionElement, str],
1187 *data: Union[dict, DimensionRecord],
1188 conform: bool = True):
1189 """Insert one or more dimension records into the database.
1191 Parameters
1192 ----------
1193 element : `DimensionElement` or `str`
1194 The `DimensionElement` or name thereof that identifies the table
1195 records will be inserted into.
1196 data : `dict` or `DimensionRecord` (variadic)
1197 One or more records to insert.
1198 conform : `bool`, optional
1199 If `False` (`True` is default) perform no checking or conversions,
1200 and assume that ``element`` is a `DimensionElement` instance and
1201 ``data`` is one or more `DimensionRecord` instances of the
1202 appropriate subclass.
1203 """
1204 if conform:
1205 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1206 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1207 for row in data]
1208 else:
1209 records = data
1210 storage = self._dimensions[element]
1211 storage.insert(*records)
1213 def syncDimensionData(self, element: Union[DimensionElement, str],
1214 row: Union[dict, DimensionRecord],
1215 conform: bool = True) -> bool:
1216 """Synchronize the given dimension record with the database, inserting
1217 if it does not already exist and comparing values if it does.
1219 Parameters
1220 ----------
1221 element : `DimensionElement` or `str`
1222 The `DimensionElement` or name thereof that identifies the table
1223 records will be inserted into.
1224 row : `dict` or `DimensionRecord`
1225 The record to insert.
1226 conform : `bool`, optional
1227 If `False` (`True` is default) perform no checking or conversions,
1228 and assume that ``element`` is a `DimensionElement` instance and
1229 ``row`` is a `DimensionRecord` instance of the appropriate
1230 subclass.
1232 Returns
1233 -------
1234 inserted : `bool`
1235 `True` if a new row was inserted, `False` otherwise.
1237 Raises
1238 ------
1239 ConflictingDefinitionError
1240 Raised if the record exists in the database (according to primary
1241 key lookup) but is inconsistent with the given one.
1243 Notes
1244 -----
1245 This method cannot be called within transactions, as it needs to be
1246 able to perform its own transaction to be concurrent.
1247 """
1248 if conform:
1249 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1250 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
1251 else:
1252 record = row
1253 storage = self._dimensions[element]
1254 return storage.sync(record)
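    # Illustrative usage sketch: inserting and then synchronizing a dimension
    # record. The record contents are assumptions for the example.
    #
    #     registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #     # sync returns True only when it actually inserts a new row.
    #     inserted = registry.syncDimensionData("instrument", {"name": "DummyCam"})
    #     assert not inserted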
1256 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1257 """Iterate over the dataset types whose names match an expression.
1259 Parameters
1260 ----------
1261 expression : `Any`, optional
1262 An expression that fully or partially identifies the dataset types
1263 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1264 `...` can be used to return all dataset types, and is the default.
1265 See :ref:`daf_butler_dataset_type_expressions` for more
1266 information.
1268 Yields
1269 ------
1270 datasetType : `DatasetType`
1271 A `DatasetType` instance whose name matches ``expression``.
1272 """
1273 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1274 if wildcard is ...:
1275 yield from self._datasets
1276 return
1277 done = set()
1278 for name in wildcard.strings:
1279 storage = self._datasets.find(name)
1280 if storage is not None:
1281 done.add(storage.datasetType)
1282 yield storage.datasetType
1283 if wildcard.patterns:
1284 for datasetType in self._datasets:
1285 if datasetType.name in done:
1286 continue
1287 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1288 yield datasetType
1290 def queryCollections(self, expression: Any = ...,
1291 datasetType: Optional[DatasetType] = None,
1292 collectionType: Optional[CollectionType] = None,
1293 flattenChains: bool = False,
1294 includeChains: Optional[bool] = None) -> Iterator[str]:
1295 """Iterate over the collections whose names match an expression.
1297 Parameters
1298 ----------
1299 expression : `Any`, optional
1300 An expression that fully or partially identifies the collections
1301 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1302 `...` can be used to return all collections, and is the default.
1303 See :ref:`daf_butler_collection_expressions` for more
1304 information.
1305 datasetType : `DatasetType`, optional
1306 If provided, only yield collections that should be searched for
1307 this dataset type according to ``expression``. If this is
1308 not provided, any dataset type restrictions in ``expression`` are
1309 ignored.
1310 collectionType : `CollectionType`, optional
1311 If provided, only yield collections of this type.
1312 flattenChains : `bool`, optional
1313 If `True` (`False` is default), recursively yield the child
1314 collections of matching `~CollectionType.CHAINED` collections.
1315 includeChains : `bool`, optional
1316 If `True`, yield records for matching `~CollectionType.CHAINED`
1317 collections. Default is the opposite of ``flattenChains``: include
1318 either CHAINED collections or their children, but not both.
1320 Yields
1321 ------
1322 collection : `str`
1323 The name of a collection that matches ``expression``.
1324 """
1325 query = CollectionQuery.fromExpression(expression)
1326 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1327 flattenChains=flattenChains, includeChains=includeChains):
1328 yield record.name
1330 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1331 """Return a `QueryBuilder` instance capable of constructing and
1332 managing more complex queries than those obtainable via `Registry`
1333 interfaces.
1335 This is an advanced interface; downstream code should prefer
1336 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1337 are sufficient.
1339 Parameters
1340 ----------
1341 summary : `QuerySummary`
1342 Object describing and categorizing the full set of dimensions that
1343 will be included in the query.
1345 Returns
1346 -------
1347 builder : `QueryBuilder`
1348 Object that can be used to construct and perform advanced queries.
1349 """
1350 return QueryBuilder(summary=summary,
1351 collections=self._collections,
1352 dimensions=self._dimensions,
1353 datasets=self._datasets)
1355 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1356 dataId: Optional[DataId] = None,
1357 datasets: Any = None,
1358 collections: Any = None,
1359 where: Optional[str] = None,
1360 expand: bool = True,
1361 **kwds) -> Iterator[DataCoordinate]:
1362 """Query for and iterate over data IDs matching user-provided criteria.
1364 Parameters
1365 ----------
1366 dimensions : `Dimension` or `str`, or iterable thereof
1367 The dimensions of the data IDs to yield, as either `Dimension`
1368 instances or `str`. Will be automatically expanded to a complete
1369 `DimensionGraph`.
1370 dataId : `dict` or `DataCoordinate`, optional
1371 A data ID whose key-value pairs are used as equality constraints
1372 in the query.
1373 datasets : `Any`, optional
1374 An expression that fully or partially identifies dataset types
1375 that should constrain the yielded data IDs. For example, including
1376 "raw" here would constrain the yielded ``instrument``,
1377 ``exposure``, ``detector``, and ``physical_filter`` values to only
1378 those for which at least one "raw" dataset exists in
1379 ``collections``. Allowed types include `DatasetType`, `str`,
1380 `re.Pattern`, and iterables thereof. Unlike other dataset type
1381 expressions, `...` is not permitted - it doesn't make sense to
1382 constrain data IDs on the existence of *all* datasets.
1383 See :ref:`daf_butler_dataset_type_expressions` for more
1384 information.
1385 collections : `Any`, optional
1386 An expression that fully or partially identifies the collections
1387 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1388 thereof. `...` can be used to return all collections. Must be
1389 provided if ``datasets`` is, and is ignored if it is not. See
1390 :ref:`daf_butler_collection_expressions` for more information.
1391 where : `str`, optional
1392 A string expression similar to a SQL WHERE clause. May involve
1393 any column of a dimension table or (as a shortcut for the primary
1394 key column of a dimension table) dimension name. See
1395 :ref:`daf_butler_dimension_expressions` for more information.
1396 expand : `bool`, optional
1397 If `True` (default) yield `ExpandedDataCoordinate` instead of
1398 minimal `DataCoordinate` base-class instances.
1399 kwds
1400 Additional keyword arguments are forwarded to
1401 `DataCoordinate.standardize` when processing the ``dataId``
1402 argument (and may be used to provide a constraining data ID even
1403 when the ``dataId`` argument is `None`).
1405 Yields
1406 ------
1407 dataId : `DataCoordinate`
1408 Data IDs matching the given query parameters. Order is
1409 unspecified.
1410 """
1411 dimensions = iterable(dimensions)
1412 standardizedDataId = self.expandDataId(dataId, **kwds)
1413 standardizedDatasetTypes = []
1414 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1415 if datasets is not None:
1416 if collections is None:
1417 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1418 for datasetType in self.queryDatasetTypes(datasets):
1419 requestedDimensionNames.update(datasetType.dimensions.names)
1420 standardizedDatasetTypes.append(datasetType)
1421 # Preprocess collections expression in case the original included
1422 # single-pass iterators (we'll want to use it multiple times
1423 # below).
1424 collections = CollectionQuery.fromExpression(collections)
1426 summary = QuerySummary(
1427 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1428 dataId=standardizedDataId,
1429 expression=where,
1430 )
1431 builder = self.makeQueryBuilder(summary)
1432 for datasetType in standardizedDatasetTypes:
1433 builder.joinDataset(datasetType, collections, isResult=False)
1434 query = builder.finish()
1435 predicate = query.predicate()
1436 for row in self._db.query(query.sql):
1437 if predicate(row):
1438 result = query.extractDataId(row)
1439 if expand:
1440 yield self.expandDataId(result, records=standardizedDataId.records)
1441 else:
1442 yield result
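    # Illustrative usage sketch: querying data IDs constrained by the existence
    # of datasets and by a string expression. The dataset type, collection name,
    # and expression are assumptions for the example.
    #
    #     dataIds = registry.queryDimensions(
    #         ["visit", "detector"],
    #         datasets="raw",
    #         collections="runs/2020-01",
    #         where="instrument = 'DummyCam' AND visit > 100",
    #     )
    #     for dataId in dataIds:
    #         ...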
1444 def queryDatasets(self, datasetType: Any, *,
1445 collections: Any,
1446 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1447 dataId: Optional[DataId] = None,
1448 where: Optional[str] = None,
1449 deduplicate: bool = False,
1450 expand: bool = True,
1451 **kwds) -> Iterator[DatasetRef]:
1452 """Query for and iterate over dataset references matching user-provided
1453 criteria.
1455 Parameters
1456 ----------
1457 datasetType
1458 An expression that fully or partially identifies the dataset types
1459 to be queried. Allowed types include `DatasetType`, `str`,
1460 `re.Pattern`, and iterables thereof. The special value `...` can
1461 be used to query all dataset types. See
1462 :ref:`daf_butler_dataset_type_expressions` for more information.
1463 collections
1464 An expression that fully or partially identifies the collections
1465 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1466 thereof. `...` can be used to return all collections. See
1467 :ref:`daf_butler_collection_expressions` for more information.
1468 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1469 Dimensions to include in the query (in addition to those used
1470 to identify the queried dataset type(s)), either to constrain
1471 the resulting datasets to those for which a matching dimension
1472 exists, or to relate the dataset type's dimensions to dimensions
1473 referenced by the ``dataId`` or ``where`` arguments.
1474 dataId : `dict` or `DataCoordinate`, optional
1475 A data ID whose key-value pairs are used as equality constraints
1476 in the query.
1477 where : `str`, optional
1478 A string expression similar to a SQL WHERE clause. May involve
1479 any column of a dimension table or (as a shortcut for the primary
1480 key column of a dimension table) dimension name. See
1481 :ref:`daf_butler_dimension_expressions` for more information.
1482 deduplicate : `bool`, optional
1483 If `True` (`False` is default), for each result data ID, only
1484 yield one `DatasetRef` of each `DatasetType`, from the first
1485 collection in which a dataset of that dataset type appears
1486 (according to the order of ``collections`` passed in). If `True`,
1487 ``collections`` must not contain regular expressions and may not
1488 be `...`.
1489 expand : `bool`, optional
1490 If `True` (default) attach `ExpandedDataCoordinate` instead of
1491 minimal `DataCoordinate` base-class instances.
1492 kwds
1493 Additional keyword arguments are forwarded to
1494 `DataCoordinate.standardize` when processing the ``dataId``
1495 argument (and may be used to provide a constraining data ID even
1496 when the ``dataId`` argument is `None`).
1498 Yields
1499 ------
1500 ref : `DatasetRef`
1501 Dataset references matching the given query criteria. These
1502 are grouped by `DatasetType` if the query evaluates to multiple
1503 dataset types, but order is otherwise unspecified.
1505 Raises
1506 ------
1507 TypeError
1508 Raised when the arguments are incompatible, such as when a
1509 collection wildcard is passed when ``deduplicate`` is `True`.
1511 Notes
1512 -----
1513 When multiple dataset types are queried in a single call, the
1514 results of this operation are equivalent to querying for each dataset
1515 type separately in turn, and no information about the relationships
1516 between datasets of different types is included. In contexts where
1517 that kind of information is important, the recommended pattern is to
1518 use `queryDimensions` to first obtain data IDs (possibly with the
1519 desired dataset types and collections passed as constraints to the
1520 query), and then use multiple (generally much simpler) calls to
1521 `queryDatasets` with the returned data IDs passed as constraints.
1522 """
1523 # Standardize the collections expression.
1524 if deduplicate:
1525 collections = CollectionSearch.fromExpression(collections)
1526 else:
1527 collections = CollectionQuery.fromExpression(collections)
1528 # Standardize and expand the data ID provided as a constraint.
1529 standardizedDataId = self.expandDataId(dataId, **kwds)
1530 # If the datasetType passed isn't actually a DatasetType, expand it
1531 # (it could be an expression that yields multiple DatasetTypes) and
1532 # recurse.
1533 if not isinstance(datasetType, DatasetType):
1534 for trueDatasetType in self.queryDatasetTypes(datasetType):
1535 yield from self.queryDatasets(trueDatasetType, collections=collections,
1536 dimensions=dimensions, dataId=standardizedDataId,
1537 where=where, deduplicate=deduplicate, expand=expand)
1538 return
1539 # The full set of dimensions in the query is the combination of those
1540 # needed for the DatasetType and those explicitly requested, if any.
1541 requestedDimensionNames = set(datasetType.dimensions.names)
1542 if dimensions is not None:
1543 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1544 # Construct the summary structure needed to construct a QueryBuilder.
1545 summary = QuerySummary(
1546 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1547 dataId=standardizedDataId,
1548 expression=where,
1549 )
1550 builder = self.makeQueryBuilder(summary)
1551 # Add the dataset subquery to the query, telling the QueryBuilder to
1552 # include the rank of the selected collection in the results only if we
1553 # need to deduplicate. Note that if any of the collections are
1554 # actually wildcard expressions, and we've asked for deduplication,
1555 # this will raise TypeError for us.
1556 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1557 return
1558 query = builder.finish()
1559 predicate = query.predicate()
1560 if not deduplicate:
1561 # No need to de-duplicate across collections.
1562 for row in self._db.query(query.sql):
1563 if predicate(row):
1564 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1565 if expand:
1566 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1567 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1568 else:
1569 # For each data ID, yield only the DatasetRef with the lowest
1570 # collection rank.
1571 bestRefs = {}
1572 bestRanks = {}
1573 for row in self._db.query(query.sql):
1574 if predicate(row):
1575 ref, rank = query.extractDatasetRef(row, datasetType)
1576 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1577 if rank < bestRank:
1578 bestRefs[ref.dataId] = ref
1579 bestRanks[ref.dataId] = rank
1580 # If caller requested expanded data IDs, we defer that until here
1581 # so we do as little expansion as possible.
1582 if expand:
1583 for ref in bestRefs.values():
1584 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1585 yield ref.expanded(dataId)
1586 else:
1587 yield from bestRefs.values()
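    # Illustrative usage sketch: querying datasets across several collections,
    # keeping only the first match per data ID. The dataset type and collection
    # names are assumptions for the example.
    #
    #     refs = registry.queryDatasets("calexp",
    #                                   collections=["runs/2020-02", "runs/2020-01"],
    #                                   where="detector = 1",
    #                                   deduplicate=True)
    #     for ref in refs:
    #         print(ref.dataId, ref.run)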
1589 dimensions: DimensionUniverse
1590 """The universe of all dimensions known to the registry
1591 (`DimensionUniverse`).
1592 """
1594 storageClasses: StorageClassFactory
1595 """All storage classes known to the registry (`StorageClassFactory`).
1596 """