Coverage for python/lsst/daf/butler/registry/_registry.py : 14%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Type,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48import lsst.sphgeom
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataId,
53 DatasetRef,
54 DatasetType,
55 Dimension,
56 DimensionElement,
57 DimensionGraph,
58 DimensionRecord,
59 DimensionUniverse,
60 ExpandedDataCoordinate,
61 FakeDatasetRef,
62 StorageClassFactory,
63)
64from ..core import ddl
65from ..core.utils import doImport, iterable, transactional
66from ._config import RegistryConfig
67from .queries import (
68 QueryBuilder,
69 QuerySummary,
70)
71from .tables import makeRegistryTableSpecs
72from ._collectionType import CollectionType
73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch
76if TYPE_CHECKING:
77 from ..butlerConfig import ButlerConfig
78 from ..core import (
79 Quantum
80 )
81 from .interfaces import (
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 )
90@dataclass
91class ConsistentDataIds:
92 """A struct used to report relationships between data IDs by
93 `Registry.relateDataIds`.
95 If an instance of this class is returned (instead of `None`), the data IDs
96 are "not inconsistent" - any keys they have in common have the same value,
97 and any spatial or temporal relationships they have at least might involve
98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
99 to `True` in boolean contexts.
100 """
102 overlaps: bool
103 """If `True`, the data IDs have at least one key in common, associated with
104 the same value.
106 Note that data IDs are not inconsistent even if overlaps is `False` - they
107 may simply have no keys in common, which means they cannot have
108 inconsistent values for any keys. They may even be equal, in the case that
109 both data IDs are empty.
111 This field does *not* indicate whether a spatial or temporal overlap
112 relationship exists.
113 """
115 contains: bool
116 """If `True`, all keys in the first data ID are in the second, and are
117 associated with the same values.
119 This includes the case where the first data ID is empty.
120 """
122 within: bool
123 """If `True`, all keys in the second data ID are in the first, and are
124 associated with the same values.
126 This includes the case where the second data ID is empty.
127 """
129 @property
130 def equal(self) -> bool:
131 """If `True`, the two data IDs are the same.
133 Data IDs are equal if they have both a `contains` and a `within`
134 relationship.
135 """
136 return self.contains and self.within
138 @property
139 def disjoint(self) -> bool:
140 """If `True`, the two data IDs have no keys in common.
142 This is simply the opposite of `overlaps`. Disjoint data IDs are by
143 definition not inconsistent.
144 """
145 return not self.overlaps
147 def __bool__(self) -> bool:
148 return True
151class Registry:
152 """Registry interface.
154 Parameters
155 ----------
156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
157 Registry configuration
158 """
160 defaultConfigFile = None
161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
162 absolute path. Can be `None` if no defaults are specified.
163 """
165 @classmethod
166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Uses ``registry.cls`` from `config` to determine which subclass to
171 instantiate.
173 Parameters
174 ----------
175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
176 Registry configuration
177 create : `bool`, optional
178 Assume empty Registry and create a new one.
179 butlerRoot : `str`, optional
180 Path to the repository root this `Registry` will manage.
181 writeable : `bool`, optional
182 If `True` (default) create a read-write connection to the database.
184 Returns
185 -------
186 registry : `Registry` (subclass)
187 A new `Registry` subclass instance.
188 """
189 if not isinstance(config, RegistryConfig):
190 if isinstance(config, (str, Config)):
191 config = RegistryConfig(config)
192 else:
193 raise ValueError("Incompatible Registry configuration: {}".format(config))
194 config.replaceRoot(butlerRoot)
195 DatabaseClass = config.getDatabaseClass()
196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
197 namespace=config.get("namespace"), writeable=writeable)
198 universe = DimensionUniverse(config)
199 opaque = doImport(config["managers", "opaque"])
200 dimensions = doImport(config["managers", "dimensions"])
201 collections = doImport(config["managers", "collections"])
202 datasets = doImport(config["managers", "datasets"])
203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
204 datasets=datasets, create=create)
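# Usage sketch (editorial illustration, not part of the module): constructing a
# read-only Registry from a repository config. The path is a placeholder.
#
#     registry = Registry.fromConfig("/repo/butler.yaml", writeable=False)
#     assert not registry.isWriteable()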
206 def __init__(self, database: Database, universe: DimensionUniverse, *,
207 opaque: Type[OpaqueTableStorageManager],
208 dimensions: Type[DimensionRecordStorageManager],
209 collections: Type[CollectionManager],
210 datasets: Type[DatasetRecordStorageManager],
211 create: bool = False):
212 self._db = database
213 self.storageClasses = StorageClassFactory()
214 with self._db.declareStaticTables(create=create) as context:
215 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
216 self._collections = collections.initialize(self._db, context)
217 self._datasets = datasets.initialize(self._db, context,
218 collections=self._collections,
219 universe=self.dimensions)
220 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
221 self._collections,
222 self._datasets))
223 self._opaque = opaque.initialize(self._db, context)
224 self._collections.refresh()
225 self._datasets.refresh(universe=self._dimensions.universe)
227 def __str__(self) -> str:
228 return str(self._db)
230 def __repr__(self) -> str:
231 return f"Registry({self._db!r}, {self.dimensions!r})"
233 def isWriteable(self) -> bool:
234 """Return `True` if this registry allows write operations, and `False`
235 otherwise.
236 """
237 return self._db.isWriteable()
239 @property
240 def dimensions(self) -> DimensionUniverse:
241 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
242 """
243 return self._dimensions.universe
245 @contextlib.contextmanager
246 def transaction(self):
247 """Return a context manager that represents a transaction.
248 """
249 # TODO make savepoint=False the default.
250 try:
251 with self._db.transaction():
252 yield
253 except BaseException:
254 # TODO: this clears the caches sometimes when we wouldn't actually
255 # need to. Can we avoid that?
256 self._dimensions.clearCaches()
257 raise
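# Usage sketch (editorial illustration): grouping registry writes so they commit
# or roll back together. The dataset type and collection names are placeholders.
#
#     with registry.transaction():
#         refs = list(registry.queryDatasets("raw", collections=["ingest/run1"]))
#         registry.associate("tagged/good", refs)
#     # If associate raises, the whole transaction is rolled back.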
259 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
260 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
261 other data repository client.
263 Opaque table records can be added via `insertOpaqueData`, retrieved via
264 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
266 Parameters
267 ----------
268 tableName : `str`
269 Logical name of the opaque table. This may differ from the
270 actual name used in the database by a prefix and/or suffix.
271 spec : `ddl.TableSpec`
272 Specification for the table to be added.
273 """
274 self._opaque.register(tableName, spec)
276 @transactional
277 def insertOpaqueData(self, tableName: str, *data: dict):
278 """Insert records into an opaque table.
280 Parameters
281 ----------
282 tableName : `str`
283 Logical name of the opaque table. Must match the name used in a
284 previous call to `registerOpaqueTable`.
285 data
286 Each additional positional argument is a dictionary that represents
287 a single row to be added.
288 """
289 self._opaque[tableName].insert(*data)
291 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
292 """Retrieve records from an opaque table.
294 Parameters
295 ----------
296 tableName : `str`
297 Logical name of the opaque table. Must match the name used in a
298 previous call to `registerOpaqueTable`.
299 where
300 Additional keyword arguments are interpreted as equality
301 constraints that restrict the returned rows (combined with AND);
302 keyword arguments are column names and values are the values they
303 must have.
305 Yields
306 ------
307 row : `dict`
308 A dictionary representing a single result row.
309 """
310 yield from self._opaque[tableName].fetch(**where)
312 @transactional
313 def deleteOpaqueData(self, tableName: str, **where: Any):
314 """Remove records from an opaque table.
316 Parameters
317 ----------
318 tableName : `str`
319 Logical name of the opaque table. Must match the name used in a
320 previous call to `registerOpaqueTable`.
321 where
322 Additional keyword arguments are interpreted as equality
323 constraints that restrict the deleted rows (combined with AND);
324 keyword arguments are column names and values are the values they
325 must have.
326 """
327 self._opaque[tableName].delete(**where)
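# Usage sketch (editorial illustration) for the opaque-table methods above. The
# table name and columns are placeholders, and the exact ddl.TableSpec /
# ddl.FieldSpec constructor arguments shown are assumptions about the core ddl
# module rather than something documented here.
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("datastore_records", spec)
#     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
#     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("datastore_records", dataset_id=1)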
329 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
330 """Add a new collection if one with the given name does not exist.
332 Parameters
333 ----------
334 name : `str`
335 The name of the collection to create.
336 type : `CollectionType`
337 Enum value indicating the type of collection to create.
339 Notes
340 -----
341 This method cannot be called within transactions, as it needs to be
342 able to perform its own transaction to be concurrent.
343 """
344 self._collections.register(name, type)
346 def getCollectionType(self, name: str) -> CollectionType:
347 """Return an enumeration value indicating the type of the given
348 collection.
350 Parameters
351 ----------
352 name : `str`
353 The name of the collection.
355 Returns
356 -------
357 type : `CollectionType`
358 Enum value indicating the type of this collection.
360 Raises
361 ------
362 MissingCollectionError
363 Raised if no collection with the given name exists.
364 """
365 return self._collections.find(name).type
367 def registerRun(self, name: str):
368 """Add a new run if one with the given name does not exist.
370 Parameters
371 ----------
372 name : `str`
373 The name of the run to create.
375 Notes
376 -----
377 This method cannot be called within transactions, as it needs to be
378 able to perform its own transaction to be concurrent.
379 """
380 self._collections.register(name, CollectionType.RUN)
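# Usage sketch (editorial illustration; collection names are placeholders):
#
#     registry.registerRun("HSC/runs/2020A")
#     registry.registerCollection("HSC/tagged", CollectionType.TAGGED)
#     assert registry.getCollectionType("HSC/runs/2020A") is CollectionType.RUN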
382 @transactional
383 def removeCollection(self, name: str):
384 """Completely remove the given collection.
386 Parameters
387 ----------
388 name : `str`
389 The name of the collection to remove.
391 Raises
392 ------
393 MissingCollectionError
394 Raised if no collection with the given name exists.
396 Notes
397 -----
398 If this is a `~CollectionType.RUN` collection, all datasets and quanta
399 in it are also fully removed. This requires that those datasets be
400 removed (or at least trashed) from any datastores that hold them first.
402 A collection may not be deleted as long as it is referenced by a
403 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
404 be deleted or redefined first.
405 """
406 self._collections.remove(name)
408 def getCollectionChain(self, parent: str) -> CollectionSearch:
409 """Return the child collections in a `~CollectionType.CHAINED`
410 collection.
412 Parameters
413 ----------
414 parent : `str`
415 Name of the chained collection. Must have already been added via
416 a call to `Registry.registerCollection`.
418 Returns
419 -------
420 children : `CollectionSearch`
421 An object that defines the search path of the collection.
422 See :ref:`daf_butler_collection_expressions` for more information.
424 Raises
425 ------
426 MissingCollectionError
427 Raised if ``parent`` does not exist in the `Registry`.
428 TypeError
429 Raised if ``parent`` does not correspond to a
430 `~CollectionType.CHAINED` collection.
431 """
432 record = self._collections.find(parent)
433 if record.type is not CollectionType.CHAINED:
434 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
435 return record.children
437 @transactional
438 def setCollectionChain(self, parent: str, children: Any):
439 """Define or redefine a `~CollectionType.CHAINED` collection.
441 Parameters
442 ----------
443 parent : `str`
444 Name of the chained collection. Must have already been added via
445 a call to `Registry.registerCollection`.
446 children : `Any`
447 An expression defining an ordered search of child collections,
448 generally an iterable of `str`. Restrictions on the dataset types
449 to be searched can also be included, by passing mapping or an
450 iterable containing tuples; see
451 :ref:`daf_butler_collection_expressions` for more information.
453 Raises
454 ------
455 MissingCollectionError
456 Raised when any of the given collections do not exist in the
457 `Registry`.
458 TypeError
459 Raised if ``parent`` does not correspond to a
460 `~CollectionType.CHAINED` collection.
461 ValueError
462 Raised if the given collections contains a cycle.
463 """
464 record = self._collections.find(parent)
465 if record.type is not CollectionType.CHAINED:
466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
467 children = CollectionSearch.fromExpression(children)
468 if children != record.children:
469 record.update(self._collections, children)
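# Usage sketch (editorial illustration; collection names are placeholders):
# define a CHAINED collection and set its ordered search path.
#
#     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/2020A", "HSC/calib"])
#     children = registry.getCollectionChain("HSC/defaults")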
471 def registerDatasetType(self, datasetType: DatasetType) -> bool:
472 """
473 Add a new `DatasetType` to the Registry.
475 It is not an error to register the same `DatasetType` twice.
477 Parameters
478 ----------
479 datasetType : `DatasetType`
480 The `DatasetType` to be added.
482 Returns
483 -------
484 inserted : `bool`
485 `True` if ``datasetType`` was inserted, `False` if an identical
486 existing `DatasetType` was found. Note that in either case the
487 DatasetType is guaranteed to be defined in the Registry
488 consistently with the given definition.
490 Raises
491 ------
492 ValueError
493 Raised if the dimensions or storage class are invalid.
494 ConflictingDefinitionError
495 Raised if this DatasetType is already registered with a different
496 definition.
498 Notes
499 -----
500 This method cannot be called within transactions, as it needs to be
501 able to perform its own transaction to be concurrent.
502 """
503 _, inserted = self._datasets.register(datasetType)
504 return inserted
506 def getDatasetType(self, name: str) -> DatasetType:
507 """Get the `DatasetType`.
509 Parameters
510 ----------
511 name : `str`
512 Name of the type.
514 Returns
515 -------
516 type : `DatasetType`
517 The `DatasetType` associated with the given name.
519 Raises
520 ------
521 KeyError
522 Requested named DatasetType could not be found in registry.
523 """
524 storage = self._datasets.find(name)
525 if storage is None:
526 raise KeyError(f"DatasetType '{name}' could not be found.")
527 return storage.datasetType
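# Usage sketch (editorial illustration): registering and retrieving a dataset
# type. The name, dimensions, and storage class are placeholders, and the
# DatasetType constructor arguments shown are an assumption about the core API.
#
#     datasetType = DatasetType("raw", dimensions=["instrument", "exposure", "detector"],
#                               storageClass="Exposure", universe=registry.dimensions)
#     registry.registerDatasetType(datasetType)
#     assert registry.getDatasetType("raw") == datasetType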
529 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
530 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
531 """Find a dataset given its `DatasetType` and data ID.
533 This can be used to obtain a `DatasetRef` that permits the dataset to
534 be read from a `Datastore`.
536 Parameters
537 ----------
538 datasetType : `DatasetType` or `str`
539 A `DatasetType` or the name of one.
540 dataId : `dict` or `DataCoordinate`, optional
541 A `dict`-like object containing the `Dimension` links that identify
542 the dataset within a collection.
543 collections
544 An expression that fully or partially identifies the collections
545 to search for the dataset, such as a `str`, `re.Pattern`, or
546 iterable thereof. `...` can be used to return all collections.
547 See :ref:`daf_butler_collection_expressions` for more information.
548 **kwargs
549 Additional keyword arguments passed to
550 `DataCoordinate.standardize` to convert ``dataId`` to a true
551 `DataCoordinate` or augment an existing one.
553 Returns
554 -------
555 ref : `DatasetRef`
556 A reference to the dataset, or `None` if no matching Dataset
557 was found.
559 Raises
560 ------
561 LookupError
562 Raised if one or more data ID keys are missing or the dataset type
563 does not exist.
564 MissingCollectionError
565 Raised if any of ``collections`` does not exist in the registry.
566 """
567 if isinstance(datasetType, DatasetType):
568 storage = self._datasets.find(datasetType.name)
569 if storage is None:
570 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
571 else:
572 storage = self._datasets.find(datasetType)
573 if storage is None:
574 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
575 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
576 universe=self.dimensions, **kwargs)
577 collections = CollectionSearch.fromExpression(collections)
578 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
579 result = storage.find(collectionRecord, dataId)
580 if result is not None:
581 if result.datasetType.isComposite():
582 result = self._datasets.fetchComponents(result)
583 return result
584 return None
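# Usage sketch (editorial illustration; the dataset type, data ID values, and
# collection name are placeholders):
#
#     ref = registry.findDataset("raw", instrument="HSC", exposure=903334,
#                                detector=10, collections=["HSC/runs/2020A"])
#     if ref is not None:
#         print(ref.id, ref.dataId)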
586 @transactional
587 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
588 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
589 ) -> List[DatasetRef]:
590 """Insert one or more datasets into the `Registry`
592 This always adds new datasets; to associate existing datasets with
593 a new collection, use ``associate``.
595 Parameters
596 ----------
597 datasetType : `DatasetType` or `str`
598 A `DatasetType` or the name of one.
599 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
600 Dimension-based identifiers for the new datasets.
601 run : `str`
602 The name of the run that produced the datasets.
603 producer : `Quantum`
604 Unit of work that produced the datasets. May be `None` to store
605 no provenance information, but if present the `Quantum` must
606 already have been added to the Registry.
607 recursive : `bool`
608 If `True`, recursively add datasets and attach entries for component
609 datasets as well.
611 Returns
612 -------
613 refs : `list` of `DatasetRef`
614 Resolved `DatasetRef` instances for all given data IDs (in the same
615 order).
617 Raises
618 ------
619 ConflictingDefinitionError
620 If a dataset with the same dataset type and data ID as one of those
621 given already exists in ``run``.
622 MissingCollectionError
623 Raised if ``run`` does not exist in the registry.
624 """
625 if isinstance(datasetType, DatasetType):
626 storage = self._datasets.find(datasetType.name)
627 if storage is None:
628 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
629 else:
630 storage = self._datasets.find(datasetType)
631 if storage is None:
632 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
633 runRecord = self._collections.find(run)
634 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds]
635 try:
636 refs = list(storage.insert(runRecord, dataIds, quantum=producer))
637 except sqlalchemy.exc.IntegrityError as err:
638 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
639 f"one or more datasets of type {storage.datasetType} into "
640 f"collection '{run}'. "
641 f"This probably means a dataset with the same data ID "
642 f"and dataset type already exists, but it may also mean a "
643 f"dimension row is missing.") from err
644 if recursive and storage.datasetType.isComposite():
645 # Insert component rows by recursing.
646 composites = defaultdict(dict)
647 # TODO: we really shouldn't be inserting all components defined by
648 # the storage class, because there's no guarantee all of them are
649 # actually present in these datasets.
650 for componentName in storage.datasetType.storageClass.components:
651 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName)
652 componentRefs = self.insertDatasets(componentDatasetType,
653 dataIds=dataIds,
654 run=run,
655 producer=producer,
656 recursive=True)
657 for parentRef, componentRef in zip(refs, componentRefs):
658 composites[parentRef][componentName] = componentRef
659 if composites:
660 refs = list(self._datasets.attachComponents(composites.items()))
661 return refs
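# Usage sketch (editorial illustration; names and data ID values are placeholders):
#
#     dataIds = [{"instrument": "HSC", "exposure": 903334, "detector": d} for d in (10, 11)]
#     refs = registry.insertDatasets("raw", dataIds=dataIds, run="HSC/runs/2020A")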
663 def getDataset(self, id: int) -> Optional[DatasetRef]:
664 """Retrieve a Dataset entry.
666 Parameters
667 ----------
668 id : `int`
669 The unique identifier for the dataset.
671 Returns
672 -------
673 ref : `DatasetRef` or `None`
674 A ref to the Dataset, or `None` if no matching Dataset
675 was found.
676 """
677 ref = self._datasets.getDatasetRef(id)
678 if ref is None:
679 return None
680 if ref.datasetType.isComposite():
681 return self._datasets.fetchComponents(ref)
682 return ref
684 @transactional
685 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True):
686 """Remove datasets from the Registry.
688 The datasets will be removed unconditionally from all collections, and
689 any `Quantum` that consumed this dataset will instead be marked with
690 having a NULL input. `Datastore` records will *not* be deleted; the
691 caller is responsible for ensuring that the dataset has already been
692 removed from all Datastores.
694 Parameters
695 ----------
696 refs : `Iterable` of `DatasetRef`
697 References to the datasets to be removed. Must include a valid
698 ``id`` attribute, and should be considered invalidated upon return.
699 recursive : `bool`, optional
700 If `True`, remove all component datasets as well. Note that
701 this only removes components that are actually included in the
702 given `DatasetRef` instances, which may not be the same as those in
703 the database (especially if they were obtained from
704 `queryDatasets`, which does not populate `DatasetRef.components`).
706 Raises
707 ------
708 AmbiguousDatasetError
709 Raised if any ``ref.id`` is `None`.
710 OrphanedRecordError
711 Raised if any dataset is still present in any `Datastore`.
712 """
713 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
714 storage = self._datasets.find(datasetType.name)
715 try:
716 storage.delete(refsForType)
717 except sqlalchemy.exc.IntegrityError as err:
718 raise OrphanedRecordError("One or more datasets is still "
719 "present in one or more Datastores.") from err
721 @transactional
722 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]):
723 """Attach components to a dataset.
725 Parameters
726 ----------
727 parent : `DatasetRef`
728 A reference to the parent dataset.
729 components : `Mapping` [ `str`, `DatasetRef` ]
730 Mapping from component name to the `DatasetRef` for that component.
732 Returns
733 -------
734 ref : `DatasetRef`
735 An updated version of ``parent`` with the given components included.
742 Raises
743 ------
744 AmbiguousDatasetError
745 Raised if ``parent.id`` or any `DatasetRef.id` in ``components``
746 is `None`.
747 """
748 for name, ref in components.items():
749 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]:
750 raise TypeError(f"Expected storage class "
751 f"'{parent.datasetType.storageClass.components[name].name}' "
752 f"for component '{name}' of dataset {parent}; got "
753 f"dataset {ref} with storage class "
754 f"'{ref.datasetType.storageClass.name}'.")
755 ref, = self._datasets.attachComponents([(parent, components)])
756 return ref
758 @transactional
759 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
760 """Add existing datasets to a `~CollectionType.TAGGED` collection.
762 If a `DatasetRef` with the same exact integer ID is already in the
763 collection, nothing is changed. If a `DatasetRef` with the same
764 `DatasetType` and data ID but with different integer ID
765 exists in the collection, `ConflictingDefinitionError` is raised.
767 Parameters
768 ----------
769 collection : `str`
770 Indicates the collection the datasets should be associated with.
771 refs : `Iterable` [ `DatasetRef` ]
772 An iterable of resolved `DatasetRef` instances that already exist
773 in this `Registry`.
774 recursive : `bool`, optional
775 If `True`, associate all component datasets as well. Note that
776 this only associates components that are actually included in the
777 given `DatasetRef` instances, which may not be the same as those in
778 the database (especially if they were obtained from
779 `queryDatasets`, which does not populate `DatasetRef.components`).
781 Raises
782 ------
783 ConflictingDefinitionError
784 If a Dataset with the given `DatasetRef` already exists in the
785 given collection.
786 AmbiguousDatasetError
787 Raised if ``any(ref.id is None for ref in refs)``.
788 MissingCollectionError
789 Raised if ``collection`` does not exist in the registry.
790 TypeError
791 Raised if adding new datasets to the given ``collection`` is not
792 allowed.
793 """
794 collectionRecord = self._collections.find(collection)
795 if collectionRecord.type is not CollectionType.TAGGED:
796 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
797 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
798 storage = self._datasets.find(datasetType.name)
799 try:
800 storage.associate(collectionRecord, refsForType)
801 except sqlalchemy.exc.IntegrityError as err:
802 raise ConflictingDefinitionError(
803 f"Constraint violation while associating dataset of type {datasetType.name} with "
804 f"collection {collection}. This probably means that one or more datasets with the same "
805 f"dataset type and data ID already exist in the collection, but it may also indicate "
806 f"that the datasets do not exist."
807 ) from err
809 @transactional
810 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
811 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
813 ``collection`` and ``ref`` combinations that are not currently
814 associated are silently ignored.
816 Parameters
817 ----------
818 collection : `str`
819 The collection the datasets should no longer be associated with.
820 refs : `Iterable` [ `DatasetRef` ]
821 An iterable of resolved `DatasetRef` instances that already exist
822 in this `Registry`.
823 recursive : `bool`, optional
824 If `True`, disassociate all component datasets as well. Note that
825 this only disassociates components that are actually included in
826 the given `DatasetRef` instances, which may not be the same as
827 those in the database (especially if they were obtained from
828 `queryDatasets`, which does not populate `DatasetRef.components`).
830 Raises
831 ------
832 AmbiguousDatasetError
833 Raised if any of the given dataset references is unresolved.
834 MissingCollectionError
835 Raised if ``collection`` does not exist in the registry.
836 TypeError
837 Raised if removing datasets from the given ``collection`` is not
838 allowed.
839 """
840 collectionRecord = self._collections.find(collection)
841 if collectionRecord.type is not CollectionType.TAGGED:
842 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
843 "expected TAGGED.")
844 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
845 storage = self._datasets.find(datasetType.name)
846 storage.disassociate(collectionRecord, refsForType)
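# Usage sketch (editorial illustration; collection and dataset type names are
# placeholders): tag some existing datasets, then untag one of them.
#
#     refs = list(registry.queryDatasets("calexp", collections=["HSC/runs/2020A"]))
#     registry.associate("HSC/tagged", refs)
#     registry.disassociate("HSC/tagged", refs[:1])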
848 @transactional
849 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
850 """Record that a datastore holds the given datasets.
852 Typically used by `Datastore`.
854 Parameters
855 ----------
856 datastoreName : `str`
857 Name of the datastore holding these datasets.
858 refs : `~collections.abc.Iterable` of `DatasetRef`
859 References to the datasets.
861 Raises
862 ------
863 AmbiguousDatasetError
864 Raised if ``any(ref.id is None for ref in refs)``.
865 """
866 self._db.insert(
867 self._tables.dataset_location,
868 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
869 )
871 @transactional
872 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
873 """Move the dataset location information to trash.
875 Parameters
876 ----------
877 datastoreName : `str`
878 Name of the datastore holding these datasets.
879 refs : `~collections.abc.Iterable` of `DatasetRef`
880 References to the datasets.
881 """
882 # We only want to move rows that already exist in the main table
883 filtered = self.checkDatasetLocations(datastoreName, refs)
884 self.canDeleteDatasetLocations(datastoreName, filtered)
885 self.removeDatasetLocation(datastoreName, filtered)
887 @transactional
888 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
889 """Record that a datastore can delete this dataset
891 Parameters
892 ----------
893 datastoreName : `str`
894 Name of the datastore holding these datasets.
895 refs : `~collections.abc.Iterable` of `DatasetRef`
896 References to the datasets.
898 Raises
899 ------
900 AmbiguousDatasetError
901 Raised if ``any(ref.id is None for ref in refs)``.
902 """
903 self._db.insert(
904 self._tables.dataset_location_trash,
905 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
906 )
908 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
909 """Check which refs are listed for this datastore.
911 Parameters
912 ----------
913 datastoreName : `str`
914 Name of the datastore holding these datasets.
915 refs : `~collections.abc.Iterable` of `DatasetRef`
916 References to the datasets.
918 Returns
919 -------
920 present : `list` of `DatasetRef`
921 All the `DatasetRef` that are listed.
922 """
924 table = self._tables.dataset_location
925 result = self._db.query(
926 sqlalchemy.sql.select(
927 [table.columns.datastore_name, table.columns.dataset_id]
928 ).where(
929 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
930 table.columns.datastore_name == datastoreName)
931 )
932 ).fetchall()
934 matched_ids = {r["dataset_id"] for r in result}
935 return [ref for ref in refs if ref.id in matched_ids]
937 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
938 """Retrieve datastore locations for a given dataset.
940 Typically used by `Datastore`.
942 Parameters
943 ----------
944 ref : `DatasetRef`
945 A reference to the dataset for which to retrieve storage
946 information.
948 Returns
949 -------
950 datastores : `set` of `str`
951 All the matching datastores holding this dataset. Empty set
952 if the dataset does not exist anywhere.
954 Raises
955 ------
956 AmbiguousDatasetError
957 Raised if ``ref.id`` is `None`.
958 """
959 table = self._tables.dataset_location
960 result = self._db.query(
961 sqlalchemy.sql.select(
962 [table.columns.datastore_name]
963 ).where(
964 table.columns.dataset_id == ref.id
965 )
966 ).fetchall()
967 return {r["datastore_name"] for r in result}
969 @transactional
970 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
971 """Retrieve all the dataset ref IDs that are in the trash
972 associated with the specified datastore.
974 Parameters
975 ----------
976 datastoreName : `str`
977 The relevant datastore name to use.
979 Returns
980 -------
981 ids : `set` of `FakeDatasetRef`
982 The IDs of datasets that can be safely removed from this datastore.
983 Can be empty.
984 """
985 table = self._tables.dataset_location_trash
986 result = self._db.query(
987 sqlalchemy.sql.select(
988 [table.columns.dataset_id]
989 ).where(
990 table.columns.datastore_name == datastoreName
991 )
992 ).fetchall()
993 return {FakeDatasetRef(r["dataset_id"]) for r in result}
995 @transactional
996 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
997 """Remove datastore location associated with these datasets from trash.
999 Typically used by `Datastore` when a dataset is removed.
1001 Parameters
1002 ----------
1003 datastoreName : `str`
1004 Name of this `Datastore`.
1005 refs : iterable of `FakeDatasetRef`
1006 The dataset IDs to be removed.
1008 Raises
1009 ------
1010 AmbiguousDatasetError
1011 Raised if ``ref.id`` is `None`.
1012 """
1013 if not refs:
1014 return
1015 self._db.delete(
1016 self._tables.dataset_location_trash,
1017 ["dataset_id", "datastore_name"],
1018 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
1019 )
1021 @transactional
1022 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
1023 """Remove datastore location associated with this dataset.
1025 Typically used by `Datastore` when a dataset is removed.
1027 Parameters
1028 ----------
1029 datastoreName : `str`
1030 Name of this `Datastore`.
1031 refs : iterable of `DatasetRef`
1032 References to the datasets for which information is to be removed.
1034 Raises
1035 ------
1036 AmbiguousDatasetError
1037 Raised if ``ref.id`` is `None`.
1038 """
1039 if not refs:
1040 return
1041 self._db.delete(
1042 self._tables.dataset_location,
1043 ["dataset_id", "datastore_name"],
1044 *[{"dataset_id": ref.getCheckedId(), "datastore_name": datastoreName} for ref in refs]
1045 )
1047 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1048 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None,
1048 **kwds) -> ExpandedDataCoordinate:
1049 """Expand a dimension-based data ID to include additional information.
1051 Parameters
1052 ----------
1053 dataId : `DataCoordinate` or `dict`, optional
1054 Data ID to be expanded; augmented and overridden by ``kwds``.
1055 graph : `DimensionGraph`, optional
1056 Set of dimensions for the expanded ID. If `None`, the dimensions
1057 will be inferred from the keys of ``dataId`` and ``kwds``.
1058 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1059 are silently ignored, providing a way to extract and expand a
1060 subset of a data ID.
1061 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1062 Dimension record data to use before querying the database for that
1063 data.
1064 **kwds
1065 Additional keywords are treated like additional key-value pairs for
1066 ``dataId``, extending and overriding it.
1068 Returns
1069 -------
1070 expanded : `ExpandedDataCoordinate`
1071 A data ID that includes full metadata for all of the dimensions it
1072 identifies.
1073 """
1074 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1075 if isinstance(standardized, ExpandedDataCoordinate):
1076 return standardized
1077 elif isinstance(dataId, ExpandedDataCoordinate):
1078 records = dict(records) if records is not None else {}
1079 records.update(dataId.records)
1080 else:
1081 records = dict(records) if records is not None else {}
1082 keys = dict(standardized)
1083 regions = []
1084 timespans = []
1085 for element in standardized.graph.primaryKeyTraversalOrder:
1086 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1087 if record is ...:
1088 storage = self._dimensions[element]
1089 record = storage.fetch(keys)
1090 records[element] = record
1091 if record is not None:
1092 for d in element.implied:
1093 value = getattr(record, d.name)
1094 if keys.setdefault(d, value) != value:
1095 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
1096 f"but {element.name} implies {d.name}={value!r}.")
1097 if element in standardized.graph.spatial and record.region is not None:
1098 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
1099 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
1100 f"is disjoint with those for other elements.")
1101 regions.append(record.region)
1102 if element in standardized.graph.temporal:
1103 if any(not record.timespan.overlaps(t) for t in timespans):
1104 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
1105 f" is disjoint with those for other elements.")
1106 timespans.append(record.timespan)
1107 else:
1108 if element in standardized.graph.required:
1109 raise LookupError(
1110 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1111 )
1112 if element.alwaysJoin:
1113 raise InconsistentDataIdError(
1114 f"Could not fetch record for element {element.name} via keys {keys}, ",
1115 f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
1116 f"related."
1117 )
1118 records.update((d, None) for d in element.implied)
1119 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
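# Usage sketch (editorial illustration; the data ID values are placeholders):
#
#     expanded = registry.expandDataId({"instrument": "HSC", "exposure": 903334}, detector=10)
#     # ``expanded.records`` now carries the matching DimensionRecord for each element.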
1121 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
1122 """Compare the keys and values of a pair of data IDs for consistency.
1124 See `ConsistentDataIds` for more information.
1126 Parameters
1127 ----------
1128 a : `dict` or `DataCoordinate`
1129 First data ID to be compared.
1130 b : `dict` or `DataCoordinate`
1131 Second data ID to be compared.
1133 Returns
1134 -------
1135 relationship : `ConsistentDataIds` or `None`
1136 Relationship information. This is not `None` and coerces to
1137 `True` in boolean contexts if and only if the data IDs are
1138 consistent in terms of all common key-value pairs, all many-to-many
1139 join tables, and all spatial and temporal relationships.
1140 """
1141 a = DataCoordinate.standardize(a, universe=self.dimensions)
1142 b = DataCoordinate.standardize(b, universe=self.dimensions)
1143 aFull = getattr(a, "full", None)
1144 bFull = getattr(b, "full", None)
1145 aBest = aFull if aFull is not None else a
1146 bBest = bFull if bFull is not None else b
1147 jointKeys = aBest.keys() & bBest.keys()
1148 # If any common values are not equal, we know they are inconsistent.
1149 if any(aBest[k] != bBest[k] for k in jointKeys):
1150 return None
1151 # If the graphs are equal, we know the data IDs are.
1152 if a.graph == b.graph:
1153 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
1154 # Result is still inconclusive. Try to expand a data ID containing
1155 # keys from both; that will fail if they are inconsistent.
1156 # First, if either input was already an ExpandedDataCoordinate, extract
1157 # its records so we don't have to query for them.
1158 records = {}
1159 if hasattr(a, "records"):
1160 records.update(a.records)
1161 if hasattr(b, "records"):
1162 records.update(b.records)
1163 try:
1164 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1165 except InconsistentDataIdError:
1166 return None
1167 # We know the answer is not `None`; time to figure out what it is.
1168 return ConsistentDataIds(
1169 contains=(a.graph >= b.graph),
1170 within=(a.graph <= b.graph),
1171 overlaps=bool(a.graph & b.graph),
1172 )
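# Usage sketch (editorial illustration; dimension values are placeholders):
#
#     rel = registry.relateDataIds({"instrument": "HSC", "detector": 10},
#                                  {"instrument": "HSC"})
#     if rel:  # not inconsistent
#         print(rel.contains, rel.within, rel.equal, rel.disjoint)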
1174 def insertDimensionData(self, element: Union[DimensionElement, str],
1175 *data: Union[dict, DimensionRecord],
1176 conform: bool = True):
1177 """Insert one or more dimension records into the database.
1179 Parameters
1180 ----------
1181 element : `DimensionElement` or `str`
1182 The `DimensionElement` or name thereof that identifies the table
1183 records will be inserted into.
1184 data : `dict` or `DimensionRecord` (variadic)
1185 One or more records to insert.
1186 conform : `bool`, optional
1187 If `False` (`True` is default) perform no checking or conversions,
1188 and assume that ``element`` is a `DimensionElement` instance and
1189 ``data`` contains one or more `DimensionRecord` instances of the
1190 appropriate subclass.
1191 """
1192 if conform:
1193 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1194 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1195 for row in data]
1196 else:
1197 records = data
1198 storage = self._dimensions[element]
1199 storage.insert(*records)
1201 def syncDimensionData(self, element: Union[DimensionElement, str],
1202 row: Union[dict, DimensionRecord],
1203 conform: bool = True) -> bool:
1204 """Synchronize the given dimension record with the database, inserting
1205 if it does not already exist and comparing values if it does.
1207 Parameters
1208 ----------
1209 element : `DimensionElement` or `str`
1210 The `DimensionElement` or name thereof that identifies the table
1211 records will be inserted into.
1212 row : `dict` or `DimensionRecord`
1213 The record to insert.
1214 conform : `bool`, optional
1215 If `False` (`True` is default) perform no checking or conversions,
1216 and assume that ``element`` is a `DimensionElement` instance and
1217 ``row`` is a `DimensionRecord` instance of the
1218 appropriate subclass.
1220 Returns
1221 -------
1222 inserted : `bool`
1223 `True` if a new row was inserted, `False` otherwise.
1225 Raises
1226 ------
1227 ConflictingDefinitionError
1228 Raised if the record exists in the database (according to primary
1229 key lookup) but is inconsistent with the given one.
1231 Notes
1232 -----
1233 This method cannot be called within transactions, as it needs to be
1234 able to perform its own transaction to be concurrent.
1235 """
1236 if conform:
1237 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1238 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1239 else:
1240 record = row
1241 storage = self._dimensions[element]
1242 return storage.sync(record)
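# Usage sketch (editorial illustration): the record field names shown here are
# placeholders; the real columns come from the dimension's schema.
#
#     registry.insertDimensionData("instrument", {"name": "HSC", "detector_max": 200})
#     inserted = registry.syncDimensionData("instrument", {"name": "HSC", "detector_max": 200})
#     # second call: record already present and consistent, so ``inserted`` is False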
1244 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1245 """Iterate over the dataset types whose names match an expression.
1247 Parameters
1248 ----------
1249 expression : `Any`, optional
1250 An expression that fully or partially identifies the dataset types
1251 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1252 `...` can be used to return all dataset types, and is the default.
1253 See :ref:`daf_butler_dataset_type_expressions` for more
1254 information.
1256 Yields
1257 ------
1258 datasetType : `DatasetType`
1259 A `DatasetType` instance whose name matches ``expression``.
1260 """
1261 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1262 if wildcard is ...:
1263 yield from self._datasets
1264 return
1265 done = set()
1266 for name in wildcard.strings:
1267 storage = self._datasets.find(name)
1268 if storage is not None:
1269 done.add(storage.datasetType)
1270 yield storage.datasetType
1271 if wildcard.patterns:
1272 for datasetType in self._datasets:
1273 if datasetType.name in done:
1274 continue
1275 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1276 yield datasetType
1278 def queryCollections(self, expression: Any = ...,
1279 datasetType: Optional[DatasetType] = None,
1280 collectionType: Optional[CollectionType] = None,
1281 flattenChains: bool = False,
1282 includeChains: Optional[bool] = None) -> Iterator[str]:
1283 """Iterate over the collections whose names match an expression.
1285 Parameters
1286 ----------
1287 expression : `Any`, optional
1288 An expression that fully or partially identifies the collections
1289 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1290 `...` can be used to return all collections, and is the default.
1291 See :ref:`daf_butler_collection_expressions` for more
1292 information.
1293 datasetType : `DatasetType`, optional
1294 If provided, only yield collections that should be searched for
1295 this dataset type according to ``expression``. If this is
1296 not provided, any dataset type restrictions in ``expression`` are
1297 ignored.
1298 collectionType : `CollectionType`, optional
1299 If provided, only yield collections of this type.
1300 flattenChains : `bool`, optional
1301 If `True` (`False` is default), recursively yield the child
1302 collections of matching `~CollectionType.CHAINED` collections.
1303 includeChains : `bool`, optional
1304 If `True`, yield records for matching `~CollectionType.CHAINED`
1305 collections. Default is the opposite of ``flattenChains``: include
1306 either CHAINED collections or their children, but not both.
1308 Yields
1309 ------
1310 collection : `str`
1311 The name of a collection that matches ``expression``.
1312 """
1313 query = CollectionQuery.fromExpression(expression)
1314 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1315 flattenChains=flattenChains, includeChains=includeChains):
1316 yield record.name
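# Usage sketch (editorial illustration; the pattern is a placeholder):
#
#     import re
#     flatTypes = list(registry.queryDatasetTypes(re.compile("flat.*")))
#     runs = list(registry.queryCollections(..., collectionType=CollectionType.RUN))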
1318 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1319 """Return a `QueryBuilder` instance capable of constructing and
1320 managing more complex queries than those obtainable via `Registry`
1321 interfaces.
1323 This is an advanced interface; downstream code should prefer
1324 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1325 are sufficient.
1327 Parameters
1328 ----------
1329 summary : `QuerySummary`
1330 Object describing and categorizing the full set of dimensions that
1331 will be included in the query.
1333 Returns
1334 -------
1335 builder : `QueryBuilder`
1336 Object that can be used to construct and perform advanced queries.
1337 """
1338 return QueryBuilder(summary=summary,
1339 collections=self._collections,
1340 dimensions=self._dimensions,
1341 datasets=self._datasets)
1343 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1344 dataId: Optional[DataId] = None,
1345 datasets: Any = None,
1346 collections: Any = None,
1347 where: Optional[str] = None,
1348 expand: bool = True,
1349 **kwds) -> Iterator[DataCoordinate]:
1350 """Query for and iterate over data IDs matching user-provided criteria.
1352 Parameters
1353 ----------
1354 dimensions : `Dimension` or `str`, or iterable thereof
1355 The dimensions of the data IDs to yield, as either `Dimension`
1356 instances or `str`. Will be automatically expanded to a complete
1357 `DimensionGraph`.
1358 dataId : `dict` or `DataCoordinate`, optional
1359 A data ID whose key-value pairs are used as equality constraints
1360 in the query.
1361 datasets : `Any`, optional
1362 An expression that fully or partially identifies dataset types
1363 that should constrain the yielded data IDs. For example, including
1364 "raw" here would constrain the yielded ``instrument``,
1365 ``exposure``, ``detector``, and ``physical_filter`` values to only
1366 those for which at least one "raw" dataset exists in
1367 ``collections``. Allowed types include `DatasetType`, `str`,
1368 `re.Pattern`, and iterables thereof. Unlike other dataset type
1369 expressions, `...` is not permitted - it doesn't make sense to
1370 constrain data IDs on the existence of *all* datasets.
1371 See :ref:`daf_butler_dataset_type_expressions` for more
1372 information.
1373 collections : `Any`, optional
1374 An expression that fully or partially identifies the collections
1375 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1376 thereof. `...` can be used to return all collections. Must be
1377 provided if ``datasets`` is, and is ignored if it is not. See
1378 :ref:`daf_butler_collection_expressions` for more information.
1379 where : `str`, optional
1380 A string expression similar to a SQL WHERE clause. May involve
1381 any column of a dimension table or (as a shortcut for the primary
1382 key column of a dimension table) dimension name. See
1383 :ref:`daf_butler_dimension_expressions` for more information.
1384 expand : `bool`, optional
1385 If `True` (default) yield `ExpandedDataCoordinate` instead of
1386 minimal `DataCoordinate` base-class instances.
1387 kwds
1388 Additional keyword arguments are forwarded to
1389 `DataCoordinate.standardize` when processing the ``dataId``
1390 argument (and may be used to provide a constraining data ID even
1391 when the ``dataId`` argument is `None`).
1393 Yields
1394 ------
1395 dataId : `DataCoordinate`
1396 Data IDs matching the given query parameters. Order is
1397 unspecified.
1398 """
1399 dimensions = iterable(dimensions)
1400 standardizedDataId = self.expandDataId(dataId, **kwds)
1401 standardizedDatasetTypes = []
1402 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1403 if datasets is not None:
1404 if collections is None:
1405 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1406 for datasetType in self.queryDatasetTypes(datasets):
1407 requestedDimensionNames.update(datasetType.dimensions.names)
1408 standardizedDatasetTypes.append(datasetType)
1409 # Preprocess collections expression in case the original included
1410 # single-pass iterators (we'll want to use it multiple times
1411 # below).
1412 collections = CollectionQuery.fromExpression(collections)
1414 summary = QuerySummary(
1415 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1416 dataId=standardizedDataId,
1417 expression=where,
1418 )
1419 builder = self.makeQueryBuilder(summary)
1420 for datasetType in standardizedDatasetTypes:
1421 builder.joinDataset(datasetType, collections, isResult=False)
1422 query = builder.finish()
1423 predicate = query.predicate()
1424 for row in self._db.query(query.sql):
1425 if predicate(row):
1426 result = query.extractDataId(row)
1427 if expand:
1428 yield self.expandDataId(result, records=standardizedDataId.records)
1429 else:
1430 yield result
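# Usage sketch (editorial illustration; the dataset type, collection, and
# ``where`` string are placeholders):
#
#     for dataId in registry.queryDimensions(["exposure", "detector"],
#                                            datasets="raw",
#                                            collections=["HSC/runs/2020A"],
#                                            where="instrument = 'HSC' AND detector = 10"):
#         print(dataId)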
1432 def queryDatasets(self, datasetType: Any, *,
1433 collections: Any,
1434 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1435 dataId: Optional[DataId] = None,
1436 where: Optional[str] = None,
1437 deduplicate: bool = False,
1438 expand: bool = True,
1439 **kwds) -> Iterator[DatasetRef]:
1440 """Query for and iterate over dataset references matching user-provided
1441 criteria.
1443 Parameters
1444 ----------
1445 datasetType
1446 An expression that fully or partially identifies the dataset types
1447 to be queried. Allowed types include `DatasetType`, `str`,
1448 `re.Pattern`, and iterables thereof. The special value `...` can
1449 be used to query all dataset types. See
1450 :ref:`daf_butler_dataset_type_expressions` for more information.
1451 collections
1452 An expression that fully or partially identifies the collections
1453 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1454 thereof. `...` can be used to return all collections. See
1455 :ref:`daf_butler_collection_expressions` for more information.
1456 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1457 Dimensions to include in the query (in addition to those used
1458 to identify the queried dataset type(s)), either to constrain
1459 the resulting datasets to those for which a matching dimension
1460 exists, or to relate the dataset type's dimensions to dimensions
1461 referenced by the ``dataId`` or ``where`` arguments.
1462 dataId : `dict` or `DataCoordinate`, optional
1463 A data ID whose key-value pairs are used as equality constraints
1464 in the query.
1465 where : `str`, optional
1466 A string expression similar to a SQL WHERE clause. May involve
1467 any column of a dimension table or (as a shortcut for the primary
1468 key column of a dimension table) dimension name. See
1469 :ref:`daf_butler_dimension_expressions` for more information.
1470 deduplicate : `bool`, optional
1471 If `True` (`False` is default), for each result data ID, only
1472 yield one `DatasetRef` of each `DatasetType`, from the first
1473 collection in which a dataset of that dataset type appears
1474 (according to the order of ``collections`` passed in). If `True`,
1475 ``collections`` must not contain regular expressions and may not
1476 be `...`.
1477 expand : `bool`, optional
1478 If `True` (default) attach `ExpandedDataCoordinate` instead of
1479 minimal `DataCoordinate` base-class instances.
1480 kwds
1481 Additional keyword arguments are forwarded to
1482 `DataCoordinate.standardize` when processing the ``dataId``
1483 argument (and may be used to provide a constraining data ID even
1484 when the ``dataId`` argument is `None`).
1486 Yields
1487 ------
1488 ref : `DatasetRef`
1489 Dataset references matching the given query criteria. These
1490 are grouped by `DatasetType` if the query evaluates to multiple
1491 dataset types, but order is otherwise unspecified.
1493 Raises
1494 ------
1495 TypeError
1496 Raised when the arguments are incompatible, such as when a
1497 collection wildcard is passed when ``deduplicate`` is `True`.
1499 Notes
1500 -----
1501 When multiple dataset types are queried in a single call, the
1502 results of this operation are equivalent to querying for each dataset
1503 type separately in turn, and no information about the relationships
1504 between datasets of different types is included. In contexts where
1505 that kind of information is important, the recommended pattern is to
1506 use `queryDimensions` to first obtain data IDs (possibly with the
1507 desired dataset types and collections passed as constraints to the
1508 query), and then use multiple (generally much simpler) calls to
1509 `queryDatasets` with the returned data IDs passed as constraints.
1510 """
1511 # Standardize the collections expression.
1512 if deduplicate:
1513 collections = CollectionSearch.fromExpression(collections)
1514 else:
1515 collections = CollectionQuery.fromExpression(collections)
1516 # Standardize and expand the data ID provided as a constraint.
1517 standardizedDataId = self.expandDataId(dataId, **kwds)
1518 # If the datasetType passed isn't actually a DatasetType, expand it
1519 # (it could be an expression that yields multiple DatasetTypes) and
1520 # recurse.
1521 if not isinstance(datasetType, DatasetType):
1522 for trueDatasetType in self.queryDatasetTypes(datasetType):
1523 yield from self.queryDatasets(trueDatasetType, collections=collections,
1524 dimensions=dimensions, dataId=standardizedDataId,
1525 where=where, deduplicate=deduplicate, expand=expand)
1526 return
1527 # The full set of dimensions in the query is the combination of those
1528 # needed for the DatasetType and those explicitly requested, if any.
1529 requestedDimensionNames = set(datasetType.dimensions.names)
1530 if dimensions is not None:
1531 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1532 # Construct the summary structure needed to construct a QueryBuilder.
1533 summary = QuerySummary(
1534 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1535 dataId=standardizedDataId,
1536 expression=where,
1537 )
1538 builder = self.makeQueryBuilder(summary)
1539 # Add the dataset subquery to the query, telling the QueryBuilder to
1540 # include the rank of the selected collection in the results only if we
1541 # need to deduplicate. Note that if any of the collections are
1542 # actually wildcard expressions, and we've asked for deduplication,
1543 # this will raise TypeError for us.
1544 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1545 return
1546 query = builder.finish()
1547 predicate = query.predicate()
1548 if not deduplicate:
1549 # No need to de-duplicate across collections.
1550 for row in self._db.query(query.sql):
1551 if predicate(row):
1552 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1553 if expand:
1554 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1555 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1556 else:
1557 # For each data ID, yield only the DatasetRef with the lowest
1558 # collection rank.
1559 bestRefs = {}
1560 bestRanks = {}
1561 for row in self._db.query(query.sql):
1562 if predicate(row):
1563 ref, rank = query.extractDatasetRef(row, datasetType)
1564 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1565 if rank < bestRank:
1566 bestRefs[ref.dataId] = ref
1567 bestRanks[ref.dataId] = rank
1568 # If caller requested expanded data IDs, we defer that until here
1569 # so we do as little expansion as possible.
1570 if expand:
1571 for ref in bestRefs.values():
1572 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1573 yield ref.expanded(dataId)
1574 else:
1575 yield from bestRefs.values()
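# Usage sketch (editorial illustration; names are placeholders): yield each
# "calexp" once, searching the listed collections in order.
#
#     refs = registry.queryDatasets("calexp",
#                                   collections=["HSC/defaults"],
#                                   where="detector = 10",
#                                   deduplicate=True)
#     for ref in refs:
#         print(ref.datasetType.name, ref.dataId)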
1577 dimensions: DimensionUniverse
1578 """The universe of all dimensions known to the registry
1579 (`DimensionUniverse`).
1580 """
1582 storageClasses: StorageClassFactory
1583 """All storage classes known to the registry (`StorageClassFactory`).
1584 """