Coverage for python/lsst/daf/butler/registry/_registry.py : 11%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import sys
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataId,
51 DatasetRef,
52 DatasetType,
53 ddl,
54 Dimension,
55 DimensionElement,
56 DimensionGraph,
57 DimensionRecord,
58 DimensionUniverse,
59 ExpandedDataCoordinate,
60 NamedKeyDict,
61 StorageClassFactory,
62)
63from ..core.utils import doImport, iterable, transactional
64from ._config import RegistryConfig
65from .queries import (
66 QueryBuilder,
67 QuerySummary,
68)
69from ._collectionType import CollectionType
70from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
71from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
72from .interfaces import ChainedCollectionRecord, RunRecord
73from .versions import ButlerVersionsManager
75if TYPE_CHECKING:
76 from ..butlerConfig import ButlerConfig
77 from .interfaces import (
78 ButlerAttributeManager,
79 CollectionManager,
80 Database,
81 OpaqueTableStorageManager,
82 DimensionRecordStorageManager,
83 DatasetRecordStorageManager,
84 DatastoreRegistryBridgeManager,
85 )
88class Registry:
89 """Registry interface.
91 Parameters
92 ----------
93 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
94 Registry configuration
95 """
97 defaultConfigFile = None
98 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
99 absolute path. Can be `None` if no defaults are specified.
100 """
102 @classmethod
103 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
104 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
105 """Create `Registry` subclass instance from `config`.
107 Uses ``registry.cls`` from `config` to determine which subclass to
108 instantiate.
110 Parameters
111 ----------
112 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
113 Registry configuration
114 create : `bool`, optional
115 Assume empty Registry and create a new one.
116 butlerRoot : `str`, optional
117 Path to the repository root this `Registry` will manage.
118 writeable : `bool`, optional
119 If `True` (default) create a read-write connection to the database.
121 Returns
122 -------
123 registry : `Registry` (subclass)
124 A new `Registry` subclass instance.
125 """
126 if not isinstance(config, RegistryConfig):
127 if isinstance(config, str) or isinstance(config, Config):
128 config = RegistryConfig(config)
129 else:
130 raise ValueError("Incompatible Registry configuration: {}".format(config))
131 config.replaceRoot(butlerRoot)
132 DatabaseClass = config.getDatabaseClass()
133 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
134 namespace=config.get("namespace"), writeable=writeable)
135 universe = DimensionUniverse(config)
136 attributes = doImport(config["managers", "attributes"])
137 opaque = doImport(config["managers", "opaque"])
138 dimensions = doImport(config["managers", "dimensions"])
139 collections = doImport(config["managers", "collections"])
140 datasets = doImport(config["managers", "datasets"])
141 datastoreBridges = doImport(config["managers", "datastores"])
142 versions = ButlerVersionsManager.fromConfig(config.get("schema_versions"))
144 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
145 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
146 versions=versions, writeable=writeable, create=create)
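    # Usage sketch (added for illustration; the repository path below is
    # hypothetical):
    #
    #     registry = Registry.fromConfig("/repo/butler.yaml", create=True,
    #                                    butlerRoot="/repo")
    #     assert registry.isWriteable()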
148 def __init__(self, database: Database, universe: DimensionUniverse, *,
149 attributes: Type[ButlerAttributeManager],
150 opaque: Type[OpaqueTableStorageManager],
151 dimensions: Type[DimensionRecordStorageManager],
152 collections: Type[CollectionManager],
153 datasets: Type[DatasetRecordStorageManager],
154 datastoreBridges: Type[DatastoreRegistryBridgeManager],
155 versions: ButlerVersionsManager,
156 writeable: bool = True,
157 create: bool = False):
158 self._db = database
159 self.storageClasses = StorageClassFactory()
160 with self._db.declareStaticTables(create=create) as context:
161 self._attributes = attributes.initialize(self._db, context)
162 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
163 self._collections = collections.initialize(self._db, context)
164 self._datasets = datasets.initialize(self._db, context,
165 collections=self._collections,
166 universe=self.dimensions)
167 self._opaque = opaque.initialize(self._db, context)
168 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
169 opaque=self._opaque,
170 datasets=datasets,
171 universe=self.dimensions)
172 context.addInitializer(lambda db: versions.storeVersions(self._attributes))
174 # This call does not do anything right now as we do not have a way to
175 # split tables between sub-schemas yet.
176 versions.checkVersionDigests()
177 if not create:
178 # verify that configured versions are compatible with schema
179 versions.checkStoredVersions(self._attributes, writeable)
181 self._collections.refresh()
182 self._datasets.refresh(universe=self._dimensions.universe)
184 def __str__(self) -> str:
185 return str(self._db)
187 def __repr__(self) -> str:
188 return f"Registry({self._db!r}, {self.dimensions!r})"
190 def isWriteable(self) -> bool:
191 """Return `True` if this registry allows write operations, and `False`
192 otherwise.
193 """
194 return self._db.isWriteable()
196 @property
197 def dimensions(self) -> DimensionUniverse:
198 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
199 """
200 return self._dimensions.universe
202 @contextlib.contextmanager
203 def transaction(self) -> Iterator[None]:
204 """Return a context manager that represents a transaction.
205 """
206 # TODO make savepoint=False the default.
207 try:
208 with self._db.transaction():
209 yield
210 except BaseException:
211 # TODO: this clears the caches sometimes when we wouldn't actually
212 # need to. Can we avoid that?
213 self._dimensions.clearCaches()
214 raise
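    # Usage sketch: group several writes so they roll back together on error.
    # The run name, tagged collection, and ``dataIds`` list are hypothetical
    # and must already exist / be registered:
    #
    #     with registry.transaction():
    #         refs = registry.insertDatasets("raw", dataIds, run="my_run")
    #         registry.associate("my-tagged-collection", refs)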
216 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
217 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
218 other data repository client.
220 Opaque table records can be added via `insertOpaqueData`, retrieved via
221 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
223 Parameters
224 ----------
225 tableName : `str`
226 Logical name of the opaque table. This may differ from the
227 actual name used in the database by a prefix and/or suffix.
228 spec : `ddl.TableSpec`
229 Specification for the table to be added.
230 """
231 self._opaque.register(tableName, spec)
233 @transactional
234 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
235 """Insert records into an opaque table.
237 Parameters
238 ----------
239 tableName : `str`
240 Logical name of the opaque table. Must match the name used in a
241 previous call to `registerOpaqueTable`.
242 data
243 Each additional positional argument is a dictionary that represents
244 a single row to be added.
245 """
246 self._opaque[tableName].insert(*data)
248 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
249 """Retrieve records from an opaque table.
251 Parameters
252 ----------
253 tableName : `str`
254 Logical name of the opaque table. Must match the name used in a
255 previous call to `registerOpaqueTable`.
256 where
257 Additional keyword arguments are interpreted as equality
258 constraints that restrict the returned rows (combined with AND);
259 keyword arguments are column names and values are the values they
260 must have.
262 Yields
263 ------
264 row : `dict`
265 A dictionary representing a single result row.
266 """
267 yield from self._opaque[tableName].fetch(**where)
269 @transactional
270 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
271 """Remove records from an opaque table.
273 Parameters
274 ----------
275 tableName : `str`
276 Logical name of the opaque table. Must match the name used in a
277 previous call to `registerOpaqueTable`.
278 where
279 Additional keyword arguments are interpreted as equality
280 constraints that restrict the deleted rows (combined with AND);
281 keyword arguments are column names and values are the values they
282 must have.
283 """
284 self._opaque[tableName].delete(**where)
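    # Usage sketch for the opaque-table round trip (the table name, ``spec``,
    # and row contents are hypothetical; ``spec`` must be a `ddl.TableSpec`
    # describing the columns):
    #
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records",
    #                               {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records",
    #                                          dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)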
286 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
287 """Add a new collection if one with the given name does not exist.
289 Parameters
290 ----------
291 name : `str`
292 The name of the collection to create.
293 type : `CollectionType`
294 Enum value indicating the type of collection to create.
296 Notes
297 -----
298 This method cannot be called within transactions, as it needs to be
299 able to perform its own transaction to be concurrent.
300 """
301 self._collections.register(name, type)
303 def getCollectionType(self, name: str) -> CollectionType:
304 """Return an enumeration value indicating the type of the given
305 collection.
307 Parameters
308 ----------
309 name : `str`
310 The name of the collection.
312 Returns
313 -------
314 type : `CollectionType`
315 Enum value indicating the type of this collection.
317 Raises
318 ------
319 MissingCollectionError
320 Raised if no collection with the given name exists.
321 """
322 return self._collections.find(name).type
324 def registerRun(self, name: str) -> None:
325 """Add a new run if one with the given name does not exist.
327 Parameters
328 ----------
329 name : `str`
330 The name of the run to create.
332 Notes
333 -----
334 This method cannot be called within transactions, as it needs to be
335 able to perform its own transaction to be concurrent.
336 """
337 self._collections.register(name, CollectionType.RUN)
339 @transactional
340 def removeCollection(self, name: str) -> None:
341 """Completely remove the given collection.
343 Parameters
344 ----------
345 name : `str`
346 The name of the collection to remove.
348 Raises
349 ------
350 MissingCollectionError
351 Raised if no collection with the given name exists.
353 Notes
354 -----
355 If this is a `~CollectionType.RUN` collection, all datasets and quanta
356 in it are also fully removed. This requires that those datasets be
357 removed (or at least trashed) from any datastores that hold them first.
359 A collection may not be deleted as long as it is referenced by a
360 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
361 be deleted or redefined first.
362 """
363 self._collections.remove(name)
365 def getCollectionChain(self, parent: str) -> CollectionSearch:
366 """Return the child collections in a `~CollectionType.CHAINED`
367 collection.
369 Parameters
370 ----------
371 parent : `str`
372 Name of the chained collection. Must have already been added via
373 a call to `Registry.registerCollection`.
375 Returns
376 -------
377 children : `CollectionSearch`
378 An object that defines the search path of the collection.
379 See :ref:`daf_butler_collection_expressions` for more information.
381 Raises
382 ------
383 MissingCollectionError
384 Raised if ``parent`` does not exist in the `Registry`.
385 TypeError
386 Raised if ``parent`` does not correspond to a
387 `~CollectionType.CHAINED` collection.
388 """
389 record = self._collections.find(parent)
390 if record.type is not CollectionType.CHAINED:
391 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
392 assert isinstance(record, ChainedCollectionRecord)
393 return record.children
395 @transactional
396 def setCollectionChain(self, parent: str, children: Any) -> None:
397 """Define or redefine a `~CollectionType.CHAINED` collection.
399 Parameters
400 ----------
401 parent : `str`
402 Name of the chained collection. Must have already been added via
403 a call to `Registry.registerCollection`.
404 children : `Any`
405 An expression defining an ordered search of child collections,
406 generally an iterable of `str`. Restrictions on the dataset types
407 to be searched can also be included, by passing a mapping or an
408 iterable containing tuples; see
409 :ref:`daf_butler_collection_expressions` for more information.
411 Raises
412 ------
413 MissingCollectionError
414 Raised when any of the given collections do not exist in the
415 `Registry`.
416 TypeError
417 Raised if ``parent`` does not correspond to a
418 `~CollectionType.CHAINED` collection.
419 ValueError
420 Raised if the given collections contain a cycle.
421 """
422 record = self._collections.find(parent)
423 if record.type is not CollectionType.CHAINED:
424 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
425 assert isinstance(record, ChainedCollectionRecord)
426 children = CollectionSearch.fromExpression(children)
427 if children != record.children:
428 record.update(self._collections, children)
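    # Usage sketch (collection names are hypothetical; the child collections
    # must already be registered): create a CHAINED collection, define its
    # search path, then read it back:
    #
    #     registry.registerCollection("defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("defaults", ["calib", "raw/all"])
    #     children = registry.getCollectionChain("defaults")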
430 def registerDatasetType(self, datasetType: DatasetType) -> bool:
431 """
432 Add a new `DatasetType` to the Registry.
434 It is not an error to register the same `DatasetType` twice.
436 Parameters
437 ----------
438 datasetType : `DatasetType`
439 The `DatasetType` to be added.
441 Returns
442 -------
443 inserted : `bool`
444 `True` if ``datasetType`` was inserted, `False` if an identical
445 existing `DatasetType` was found. Note that in either case the
446 DatasetType is guaranteed to be defined in the Registry
447 consistently with the given definition.
449 Raises
450 ------
451 ValueError
452 Raised if the dimensions or storage class are invalid.
453 ConflictingDefinitionError
454 Raised if this DatasetType is already registered with a different
455 definition.
457 Notes
458 -----
459 This method cannot be called within transactions, as it needs to be
460 able to perform its own transaction to be concurrent.
461 """
462 _, inserted = self._datasets.register(datasetType)
463 return inserted
465 def getDatasetType(self, name: str) -> DatasetType:
466 """Get the `DatasetType`.
468 Parameters
469 ----------
470 name : `str`
471 Name of the type.
473 Returns
474 -------
475 type : `DatasetType`
476 The `DatasetType` associated with the given name.
478 Raises
479 ------
480 KeyError
481 Requested named DatasetType could not be found in registry.
482 """
483 storage = self._datasets.find(name)
484 if storage is None:
485 raise KeyError(f"DatasetType '{name}' could not be found.")
486 return storage.datasetType
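    # Usage sketch (dataset type name, dimensions, and storage class are
    # hypothetical; assumes the `DatasetType` constructor accepts dimension
    # names plus ``universe``):
    #
    #     datasetType = DatasetType("calexp",
    #                               dimensions=("instrument", "visit",
    #                                           "detector"),
    #                               storageClass="ExposureF",
    #                               universe=registry.dimensions)
    #     registry.registerDatasetType(datasetType)
    #     assert registry.getDatasetType("calexp") == datasetType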
488 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
489 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
490 """Find a dataset given its `DatasetType` and data ID.
492 This can be used to obtain a `DatasetRef` that permits the dataset to
493 be read from a `Datastore`. If the dataset is a component and can not
494 be found using the provided dataset type, a dataset ref for the parent
495 will be returned instead but with the correct dataset type.
497 Parameters
498 ----------
499 datasetType : `DatasetType` or `str`
500 A `DatasetType` or the name of one.
501 dataId : `dict` or `DataCoordinate`, optional
502 A `dict`-like object containing the `Dimension` links that identify
503 the dataset within a collection.
504 collections
505 An expression that fully or partially identifies the collections
506 to search for the dataset, such as a `str`, `re.Pattern`, or
507 iterable thereof. `...` can be used to return all collections.
508 See :ref:`daf_butler_collection_expressions` for more information.
509 **kwargs
510 Additional keyword arguments passed to
511 `DataCoordinate.standardize` to convert ``dataId`` to a true
512 `DataCoordinate` or augment an existing one.
514 Returns
515 -------
516 ref : `DatasetRef`
517 A reference to the dataset, or `None` if no matching Dataset
518 was found.
520 Raises
521 ------
522 LookupError
523 Raised if one or more data ID keys are missing or the dataset type
524 does not exist.
525 MissingCollectionError
526 Raised if any of ``collections`` does not exist in the registry.
527 """
528 if isinstance(datasetType, DatasetType):
529 storage = self._datasets.find(datasetType.name)
530 if storage is None:
531 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
532 else:
533 storage = self._datasets.find(datasetType)
534 if storage is None:
535 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
536 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
537 universe=self.dimensions, **kwargs)
538 collections = CollectionSearch.fromExpression(collections)
539 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
540 result = storage.find(collectionRecord, dataId)
541 if result is not None:
542 return result
544 # fallback to the parent if we got nothing and this was a component
545 if storage.datasetType.isComponent():
546 parentType, _ = storage.datasetType.nameAndComponent()
547 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
548 if parentRef is not None:
549 # Should already conform and we know no components
550 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
551 run=parentRef.run, conform=False, hasParentId=True)
553 return None
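    # Usage sketch (dataset type name, data ID values, and collection names
    # are hypothetical):
    #
    #     ref = registry.findDataset("calexp",
    #                                instrument="HSC", visit=903334,
    #                                detector=22,
    #                                collections=["HSC/runs/test"])
    #     if ref is not None:
    #         print(ref.run, ref.id)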
555 @transactional
556 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
557 run: str) -> List[DatasetRef]:
558 """Insert one or more datasets into the `Registry`
560 This always adds new datasets; to associate existing datasets with
561 a new collection, use ``associate``.
563 Parameters
564 ----------
565 datasetType : `DatasetType` or `str`
566 A `DatasetType` or the name of one.
567 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
568 Dimension-based identifiers for the new datasets.
569 run : `str`
570 The name of the run that produced the datasets.
572 Returns
573 -------
574 refs : `list` of `DatasetRef`
575 Resolved `DatasetRef` instances for all given data IDs (in the same
576 order).
578 Raises
579 ------
580 ConflictingDefinitionError
581 If a dataset with the same dataset type and data ID as one of those
582 given already exists in ``run``.
583 MissingCollectionError
584 Raised if ``run`` does not exist in the registry.
585 """
586 if isinstance(datasetType, DatasetType):
587 storage = self._datasets.find(datasetType.name)
588 if storage is None:
589 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
590 else:
591 storage = self._datasets.find(datasetType)
592 if storage is None:
593 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
594 runRecord = self._collections.find(run)
595 if runRecord.type is not CollectionType.RUN:
596 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
597 assert isinstance(runRecord, RunRecord)
598 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
599 for dataId in dataIds]
600 try:
601 refs = list(storage.insert(runRecord, expandedDataIds))
602 except sqlalchemy.exc.IntegrityError as err:
603 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
604 f"one or more datasets of type {storage.datasetType} into "
605 f"collection '{run}'. "
606 f"This probably means a dataset with the same data ID "
607 f"and dataset type already exists, but it may also mean a "
608 f"dimension row is missing.") from err
609 return refs
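    # Usage sketch (run name, dataset type, and data ID values are
    # hypothetical; the run and the dataset type must already be registered):
    #
    #     registry.registerRun("HSC/runs/test")
    #     refs = registry.insertDatasets(
    #         "raw",
    #         dataIds=[{"instrument": "HSC", "exposure": 903334,
    #                   "detector": 22}],
    #         run="HSC/runs/test",
    #     )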
611 def getDataset(self, id: int) -> Optional[DatasetRef]:
612 """Retrieve a Dataset entry.
614 Parameters
615 ----------
616 id : `int`
617 The unique identifier for the dataset.
619 Returns
620 -------
621 ref : `DatasetRef` or `None`
622 A ref to the Dataset, or `None` if no matching Dataset
623 was found.
624 """
625 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
626 if ref is None:
627 return None
628 return ref
630 @transactional
631 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
632 """Remove datasets from the Registry.
634 The datasets will be removed unconditionally from all collections, and
635 any `Quantum` that consumed this dataset will instead be marked as
636 having a NULL input. `Datastore` records will *not* be deleted; the
637 caller is responsible for ensuring that the dataset has already been
638 removed from all Datastores.
640 Parameters
641 ----------
642 refs : `Iterable` of `DatasetRef`
643 References to the datasets to be removed. Must include a valid
644 ``id`` attribute, and should be considered invalidated upon return.
646 Raises
647 ------
648 AmbiguousDatasetError
649 Raised if any ``ref.id`` is `None`.
650 OrphanedRecordError
651 Raised if any dataset is still present in any `Datastore`.
652 """
653 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
654 storage = self._datasets.find(datasetType.name)
655 assert storage is not None
656 try:
657 storage.delete(refsForType)
658 except sqlalchemy.exc.IntegrityError as err:
659 raise OrphanedRecordError("One or more datasets is still "
660 "present in one or more Datastores.") from err
662 @transactional
663 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
664 """Add existing datasets to a `~CollectionType.TAGGED` collection.
666 If a DatasetRef with the same exact integer ID is already in a
667 collection nothing is changed. If a `DatasetRef` with the same
668 `DatasetType` and data ID but with different integer ID
669 exists in the collection, `ConflictingDefinitionError` is raised.
671 Parameters
672 ----------
673 collection : `str`
674 Indicates the collection the datasets should be associated with.
675 refs : `Iterable` [ `DatasetRef` ]
676 An iterable of resolved `DatasetRef` instances that already exist
677 in this `Registry`.
679 Raises
680 ------
681 ConflictingDefinitionError
682 If a Dataset with the given `DatasetRef` already exists in the
683 given collection.
684 AmbiguousDatasetError
685 Raised if ``any(ref.id is None for ref in refs)``.
686 MissingCollectionError
687 Raised if ``collection`` does not exist in the registry.
688 TypeError
689 Raised if adding new datasets to the given ``collection`` is not
690 allowed.
691 """
692 collectionRecord = self._collections.find(collection)
693 if collectionRecord.type is not CollectionType.TAGGED:
694 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
695 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
696 storage = self._datasets.find(datasetType.name)
697 assert storage is not None
698 try:
699 storage.associate(collectionRecord, refsForType)
700 except sqlalchemy.exc.IntegrityError as err:
701 raise ConflictingDefinitionError(
702 f"Constraint violation while associating dataset of type {datasetType.name} with "
703 f"collection {collection}. This probably means that one or more datasets with the same "
704 f"dataset type and data ID already exist in the collection, but it may also indicate "
705 f"that the datasets do not exist."
706 ) from err
708 @transactional
709 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
710 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
712 ``collection`` and ``ref`` combinations that are not currently
713 associated are silently ignored.
715 Parameters
716 ----------
717 collection : `str`
718 The collection the datasets should no longer be associated with.
719 refs : `Iterable` [ `DatasetRef` ]
720 An iterable of resolved `DatasetRef` instances that already exist
721 in this `Registry`.
723 Raises
724 ------
725 AmbiguousDatasetError
726 Raised if any of the given dataset references is unresolved.
727 MissingCollectionError
728 Raised if ``collection`` does not exist in the registry.
729 TypeError
730 Raised if removing datasets from the given ``collection`` is not
731 allowed.
732 """
733 collectionRecord = self._collections.find(collection)
734 if collectionRecord.type is not CollectionType.TAGGED:
735 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
736 "expected TAGGED.")
737 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
738 storage = self._datasets.find(datasetType.name)
739 assert storage is not None
740 storage.disassociate(collectionRecord, refsForType)
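    # Usage sketch (the collection name is hypothetical; ``refs`` must be
    # resolved `DatasetRef` instances already present in this Registry):
    #
    #     registry.registerCollection("best-seeing", CollectionType.TAGGED)
    #     registry.associate("best-seeing", refs)
    #     registry.disassociate("best-seeing", refs)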
742 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
743 """Return an object that allows a new `Datastore` instance to
744 communicate with this `Registry`.
746 Returns
747 -------
748 manager : `DatastoreRegistryBridgeManager`
749 Object that mediates communication between this `Registry` and its
750 associated datastores.
751 """
752 return self._datastoreBridges
754 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
755 """Retrieve datastore locations for a given dataset.
757 Parameters
758 ----------
759 ref : `DatasetRef`
760 A reference to the dataset for which to retrieve storage
761 information.
763 Returns
764 -------
765 datastores : `Iterable` [ `str` ]
766 All the matching datastores holding this dataset.
768 Raises
769 ------
770 AmbiguousDatasetError
771 Raised if ``ref.id`` is `None`.
772 """
773 return self._datastoreBridges.findDatastores(ref)
775 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
776 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None,
777 **kwargs: Any) -> ExpandedDataCoordinate:
778 """Expand a dimension-based data ID to include additional information.
780 Parameters
781 ----------
782 dataId : `DataCoordinate` or `dict`, optional
783 Data ID to be expanded; augmented and overridden by ``kwargs``.
784 graph : `DimensionGraph`, optional
785 Set of dimensions for the expanded ID. If `None`, the dimensions
786 will be inferred from the keys of ``dataId`` and ``kwargs``.
787 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
788 are silently ignored, providing a way to extract and expand a
789 subset of a data ID.
790 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional
791 Dimension record data to use before querying the database for that
792 data.
793 **kwargs
794 Additional keywords are treated like additional key-value pairs for
795 ``dataId``, extending and overriding it.
797 Returns
798 -------
799 expanded : `ExpandedDataCoordinate`
800 A data ID that includes full metadata for all of the dimensions it
801 identifies.
802 """
803 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
804 if isinstance(standardized, ExpandedDataCoordinate):
805 return standardized
806 elif isinstance(dataId, ExpandedDataCoordinate):
807 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
808 records.update(dataId.records)
809 else:
810 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
811 keys = dict(standardized.byName())
812 for element in standardized.graph.primaryKeyTraversalOrder:
813 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
814 if record is ...:
815 storage = self._dimensions[element]
816 record = storage.fetch(keys)
817 records[element] = record
818 if record is not None:
819 for d in element.implied:
820 value = getattr(record, d.name)
821 if keys.setdefault(d.name, value) != value:
822 raise InconsistentDataIdError(
823 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
824 f"but {element.name} implies {d.name}={value!r}."
825 )
826 else:
827 if element in standardized.graph.required:
828 raise LookupError(
829 f"Could not fetch record for required dimension {element.name} via keys {keys}."
830 )
831 if element.alwaysJoin:
832 raise InconsistentDataIdError(
833 f"Could not fetch record for element {element.name} via keys {keys}, ",
834 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
835 "related."
836 )
837 records.update((d, None) for d in element.implied)
838 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
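    # Usage sketch (data ID values are hypothetical):
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334,
    #                                    detector=22)
    #     print(dataId.records["exposure"])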
840 def insertDimensionData(self, element: Union[DimensionElement, str],
841 *data: Union[Mapping[str, Any], DimensionRecord],
842 conform: bool = True) -> None:
843 """Insert one or more dimension records into the database.
845 Parameters
846 ----------
847 element : `DimensionElement` or `str`
848 The `DimensionElement` or name thereof that identifies the table
849 records will be inserted into.
850 data : `dict` or `DimensionRecord` (variadic)
851 One or more records to insert.
852 conform : `bool`, optional
853 If `False` (`True` is default) perform no checking or conversions,
854 and assume that ``element`` is a `DimensionElement` instance and
855 ``data`` is one or more `DimensionRecord` instances of the
856 appropriate subclass.
857 """
858 if conform:
859 if isinstance(element, str):
860 element = self.dimensions[element]
861 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
862 for row in data]
863 else:
864 # Ignore typing since caller said to trust them with conform=False.
865 records = data # type: ignore
866 storage = self._dimensions[element] # type: ignore
867 storage.insert(*records)
869 def syncDimensionData(self, element: Union[DimensionElement, str],
870 row: Union[Mapping[str, Any], DimensionRecord],
871 conform: bool = True) -> bool:
872 """Synchronize the given dimension record with the database, inserting
873 if it does not already exist and comparing values if it does.
875 Parameters
876 ----------
877 element : `DimensionElement` or `str`
878 The `DimensionElement` or name thereof that identifies the table
879 records will be inserted into.
880 row : `dict` or `DimensionRecord`
881 The record to insert.
882 conform : `bool`, optional
883 If `False` (`True` is default) perform no checking or conversions,
884 and assume that ``element`` is a `DimensionElement` instance and
885 ``row`` is a `DimensionRecord` instance of the
886 appropriate subclass.
888 Returns
889 -------
890 inserted : `bool`
891 `True` if a new row was inserted, `False` otherwise.
893 Raises
894 ------
895 ConflictingDefinitionError
896 Raised if the record exists in the database (according to primary
897 key lookup) but is inconsistent with the given one.
899 Notes
900 -----
901 This method cannot be called within transactions, as it needs to be
902 able to perform its own transaction to be concurrent.
903 """
904 if conform:
905 if isinstance(element, str):
906 element = self.dimensions[element]
907 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
908 else:
909 # Ignore typing since caller said to trust them with conform=False.
910 record = row # type: ignore
911 storage = self._dimensions[element] # type: ignore
912 return storage.sync(record)
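    # Usage sketch (the element names and record fields below are
    # hypothetical and depend on the configured dimension universe):
    #
    #     registry.insertDimensionData(
    #         "detector",
    #         {"instrument": "HSC", "id": 22, "full_name": "1_53"},
    #     )
    #     inserted = registry.syncDimensionData(
    #         "instrument", {"name": "HSC", "detector_max": 200})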
914 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
915 ) -> Iterator[DatasetType]:
916 """Iterate over the dataset types whose names match an expression.
918 Parameters
919 ----------
920 expression : `Any`, optional
921 An expression that fully or partially identifies the dataset types
922 to return, such as a `str`, `re.Pattern`, or iterable thereof.
923 `...` can be used to return all dataset types, and is the default.
924 See :ref:`daf_butler_dataset_type_expressions` for more
925 information.
926 components : `bool`, optional
927 If `True`, apply all expression patterns to component dataset type
928 names as well. If `False`, never apply patterns to components.
929 If `None` (default), apply patterns to components only if their
930 parent datasets were not matched by the expression.
931 Fully-specified component datasets (`str` or `DatasetType`
932 instances) are always included.
934 Yields
935 ------
936 datasetType : `DatasetType`
937 A `DatasetType` instance whose name matches ``expression``.
938 """
939 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
940 if wildcard is Ellipsis:
941 for datasetType in self._datasets:
942 if components or not datasetType.isComponent():
943 yield datasetType
944 return
945 done: Set[str] = set()
946 for name in wildcard.strings:
947 storage = self._datasets.find(name)
948 if storage is not None:
949 done.add(storage.datasetType.name)
950 yield storage.datasetType
951 if wildcard.patterns:
952 # If components (the argument) is None, we'll save component
953 # datasets that we might want to match, but only if their parents
954 # didn't get included.
955 componentsForLater = []
956 for datasetType in self._datasets:
957 if datasetType.name in done:
958 continue
959 parentName, componentName = datasetType.nameAndComponent()
960 if componentName is not None and not components:
961 if components is None and parentName not in done:
962 componentsForLater.append(datasetType)
963 continue
964 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
965 done.add(datasetType.name)
966 yield datasetType
967 # Go back and try to match saved components.
968 for datasetType in componentsForLater:
969 parentName, _ = datasetType.nameAndComponent()
970 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
971 yield datasetType
973 def queryCollections(self, expression: Any = ...,
974 datasetType: Optional[DatasetType] = None,
975 collectionType: Optional[CollectionType] = None,
976 flattenChains: bool = False,
977 includeChains: Optional[bool] = None) -> Iterator[str]:
978 """Iterate over the collections whose names match an expression.
980 Parameters
981 ----------
982 expression : `Any`, optional
983 An expression that fully or partially identifies the collections
984 to return, such as a `str`, `re.Pattern`, or iterable thereof.
985 `...` can be used to return all collections, and is the default.
986 See :ref:`daf_butler_collection_expressions` for more
987 information.
988 datasetType : `DatasetType`, optional
989 If provided, only yield collections that should be searched for
990 this dataset type according to ``expression``. If this is
991 not provided, any dataset type restrictions in ``expression`` are
992 ignored.
993 collectionType : `CollectionType`, optional
994 If provided, only yield collections of this type.
995 flattenChains : `bool`, optional
996 If `True` (`False` is default), recursively yield the child
997 collections of matching `~CollectionType.CHAINED` collections.
998 includeChains : `bool`, optional
999 If `True`, yield records for matching `~CollectionType.CHAINED`
1000 collections. Default is the opposite of ``flattenChains``: include
1001 either CHAINED collections or their children, but not both.
1003 Yields
1004 ------
1005 collection : `str`
1006 The name of a collection that matches ``expression``.
1007 """
1008 query = CollectionQuery.fromExpression(expression)
1009 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1010 flattenChains=flattenChains, includeChains=includeChains):
1011 yield record.name
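    # Usage sketch (the pattern and filters are hypothetical):
    #
    #     import re
    #     for datasetType in registry.queryDatasetTypes(re.compile("calexp.*")):
    #         print(datasetType.name)
    #     runs = list(registry.queryCollections(
    #         ..., collectionType=CollectionType.RUN))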
1013 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1014 """Return a `QueryBuilder` instance capable of constructing and
1015 managing more complex queries than those obtainable via `Registry`
1016 interfaces.
1018 This is an advanced interface; downstream code should prefer
1019 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1020 are sufficient.
1022 Parameters
1023 ----------
1024 summary : `QuerySummary`
1025 Object describing and categorizing the full set of dimensions that
1026 will be included in the query.
1028 Returns
1029 -------
1030 builder : `QueryBuilder`
1031 Object that can be used to construct and perform advanced queries.
1032 """
1033 return QueryBuilder(summary=summary,
1034 collections=self._collections,
1035 dimensions=self._dimensions,
1036 datasets=self._datasets)
1038 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1039 dataId: Optional[DataId] = None,
1040 datasets: Any = None,
1041 collections: Any = None,
1042 where: Optional[str] = None,
1043 expand: bool = True,
1044 components: Optional[bool] = None,
1045 **kwargs: Any) -> Iterator[DataCoordinate]:
1046 """Query for and iterate over data IDs matching user-provided criteria.
1048 Parameters
1049 ----------
1050 dimensions : `Dimension` or `str`, or iterable thereof
1051 The dimensions of the data IDs to yield, as either `Dimension`
1052 instances or `str`. Will be automatically expanded to a complete
1053 `DimensionGraph`.
1054 dataId : `dict` or `DataCoordinate`, optional
1055 A data ID whose key-value pairs are used as equality constraints
1056 in the query.
1057 datasets : `Any`, optional
1058 An expression that fully or partially identifies dataset types
1059 that should constrain the yielded data IDs. For example, including
1060 "raw" here would constrain the yielded ``instrument``,
1061 ``exposure``, ``detector``, and ``physical_filter`` values to only
1062 those for which at least one "raw" dataset exists in
1063 ``collections``. Allowed types include `DatasetType`, `str`,
1064 `re.Pattern`, and iterables thereof. Unlike other dataset type
1065 expressions, `...` is not permitted - it doesn't make sense to
1066 constrain data IDs on the existence of *all* datasets.
1067 See :ref:`daf_butler_dataset_type_expressions` for more
1068 information.
1069 collections : `Any`, optional
1070 An expression that fully or partially identifies the collections
1071 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1072 thereof. `...` can be used to return all collections. Must be
1073 provided if ``datasets`` is, and is ignored if it is not. See
1074 :ref:`daf_butler_collection_expressions` for more information.
1075 where : `str`, optional
1076 A string expression similar to a SQL WHERE clause. May involve
1077 any column of a dimension table or (as a shortcut for the primary
1078 key column of a dimension table) dimension name. See
1079 :ref:`daf_butler_dimension_expressions` for more information.
1080 expand : `bool`, optional
1081 If `True` (default) yield `ExpandedDataCoordinate` instead of
1082 minimal `DataCoordinate` base-class instances.
1083 components : `bool`, optional
1084 If `True`, apply all dataset expression patterns to component
1085 dataset type names as well. If `False`, never apply patterns to
1086 components. If `None` (default), apply patterns to components only
1087 if their parent datasets were not matched by the expression.
1088 Fully-specified component datasets (`str` or `DatasetType`
1089 instances) are always included.
1090 **kwargs
1091 Additional keyword arguments are forwarded to
1092 `DataCoordinate.standardize` when processing the ``dataId``
1093 argument (and may be used to provide a constraining data ID even
1094 when the ``dataId`` argument is `None`).
1096 Yields
1097 ------
1098 dataId : `DataCoordinate`
1099 Data IDs matching the given query parameters. Order is
1100 unspecified.
1101 """
1102 dimensions = iterable(dimensions)
1103 standardizedDataId = self.expandDataId(dataId, **kwargs)
1104 standardizedDatasetTypes = set()
1105 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1106 if datasets is not None:
1107 if collections is None:
1108 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1109 for datasetType in self.queryDatasetTypes(datasets, components=components):
1110 requestedDimensionNames.update(datasetType.dimensions.names)
1111 # If any matched dataset type is a component, just operate on
1112 # its parent instead, because Registry doesn't know anything
1113 # about what components exist, and here (unlike queryDatasets)
1114 # we don't care about returning them.
1115 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1116 if componentName is not None:
1117 datasetType = self.getDatasetType(parentDatasetTypeName)
1118 standardizedDatasetTypes.add(datasetType)
1119 # Preprocess collections expression in case the original included
1120 # single-pass iterators (we'll want to use it multiple times
1121 # below).
1122 collections = CollectionQuery.fromExpression(collections)
1124 summary = QuerySummary(
1125 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1126 dataId=standardizedDataId,
1127 expression=where,
1128 )
1129 builder = self.makeQueryBuilder(summary)
1130 for datasetType in standardizedDatasetTypes:
1131 builder.joinDataset(datasetType, collections, isResult=False)
1132 query = builder.finish()
1133 predicate = query.predicate()
1134 for row in self._db.query(query.sql):
1135 if predicate(row):
1136 result = query.extractDataId(row)
1137 if expand:
1138 yield self.expandDataId(result, records=standardizedDataId.records)
1139 else:
1140 yield result
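    # Usage sketch (dimension names, dataset type, collection name, and the
    # ``where`` expression are hypothetical):
    #
    #     for dataId in registry.queryDimensions(
    #             ["exposure", "detector"],
    #             datasets="raw",
    #             collections="HSC/runs/test",
    #             where="instrument = 'HSC' AND detector = 22",
    #     ):
    #         print(dataId)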
1142 def queryDatasets(self, datasetType: Any, *,
1143 collections: Any,
1144 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1145 dataId: Optional[DataId] = None,
1146 where: Optional[str] = None,
1147 deduplicate: bool = False,
1148 expand: bool = True,
1149 components: Optional[bool] = None,
1150 **kwargs: Any) -> Iterator[DatasetRef]:
1151 """Query for and iterate over dataset references matching user-provided
1152 criteria.
1154 Parameters
1155 ----------
1156 datasetType
1157 An expression that fully or partially identifies the dataset types
1158 to be queried. Allowed types include `DatasetType`, `str`,
1159 `re.Pattern`, and iterables thereof. The special value `...` can
1160 be used to query all dataset types. See
1161 :ref:`daf_butler_dataset_type_expressions` for more information.
1162 collections
1163 An expression that fully or partially identifies the collections
1164 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1165 thereof. `...` can be used to return all collections. See
1166 :ref:`daf_butler_collection_expressions` for more information.
1167 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1168 Dimensions to include in the query (in addition to those used
1169 to identify the queried dataset type(s)), either to constrain
1170 the resulting datasets to those for which a matching dimension
1171 exists, or to relate the dataset type's dimensions to dimensions
1172 referenced by the ``dataId`` or ``where`` arguments.
1173 dataId : `dict` or `DataCoordinate`, optional
1174 A data ID whose key-value pairs are used as equality constraints
1175 in the query.
1176 where : `str`, optional
1177 A string expression similar to a SQL WHERE clause. May involve
1178 any column of a dimension table or (as a shortcut for the primary
1179 key column of a dimension table) dimension name. See
1180 :ref:`daf_butler_dimension_expressions` for more information.
1181 deduplicate : `bool`, optional
1182 If `True` (`False` is default), for each result data ID, only
1183 yield one `DatasetRef` of each `DatasetType`, from the first
1184 collection in which a dataset of that dataset type appears
1185 (according to the order of ``collections`` passed in). If `True`,
1186 ``collections`` must not contain regular expressions and may not
1187 be `...`.
1188 expand : `bool`, optional
1189 If `True` (default) attach `ExpandedDataCoordinate` instead of
1190 minimal `DataCoordinate` base-class instances.
1191 components : `bool`, optional
1192 If `True`, apply all dataset expression patterns to component
1193 dataset type names as well. If `False`, never apply patterns to
1194 components. If `None` (default), apply patterns to components only
1195 if their parent datasets were not matched by the expression.
1196 Fully-specified component datasets (`str` or `DatasetType`
1197 instances) are always included.
1198 **kwargs
1199 Additional keyword arguments are forwarded to
1200 `DataCoordinate.standardize` when processing the ``dataId``
1201 argument (and may be used to provide a constraining data ID even
1202 when the ``dataId`` argument is `None`).
1204 Yields
1205 ------
1206 ref : `DatasetRef`
1207 Dataset references matching the given query criteria. These
1208 are grouped by `DatasetType` if the query evaluates to multiple
1209 dataset types, but order is otherwise unspecified.
1211 Raises
1212 ------
1213 TypeError
1214 Raised when the arguments are incompatible, such as when a
1215 collection wildcard is passed when ``deduplicate`` is `True`.
1217 Notes
1218 -----
1219 When multiple dataset types are queried in a single call, the
1220 results of this operation are equivalent to querying for each dataset
1221 type separately in turn, and no information about the relationships
1222 between datasets of different types is included. In contexts where
1223 that kind of information is important, the recommended pattern is to
1224 use `queryDimensions` to first obtain data IDs (possibly with the
1225 desired dataset types and collections passed as constraints to the
1226 query), and then use multiple (generally much simpler) calls to
1227 `queryDatasets` with the returned data IDs passed as constraints.
1228 """
1229 # Standardize the collections expression.
1230 if deduplicate:
1231 collections = CollectionSearch.fromExpression(collections)
1232 else:
1233 collections = CollectionQuery.fromExpression(collections)
1234 # Standardize and expand the data ID provided as a constraint.
1235 standardizedDataId = self.expandDataId(dataId, **kwargs)
1237 # We can only query directly if given a non-component DatasetType
1238 # instance. If we were given an expression or str or a component
1239 # DatasetType instance, we'll populate this dict, recurse, and return.
1240 # If we already have a non-component DatasetType, it will remain None
1241 # and we'll run the query directly.
1242 composition: Optional[
1243 Dict[
1244 DatasetType, # parent dataset type
1245 List[Optional[str]] # component name, or None for parent
1246 ]
1247 ] = None
1248 if not isinstance(datasetType, DatasetType):
1249 # We were given a dataset type expression (which may be as simple
1250 # as a str). Loop over all matching datasets, delegating handling
1251 # of the `components` argument to queryDatasetTypes, as we populate
1252 # the composition dict.
1253 composition = defaultdict(list)
1254 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1255 parentName, componentName = trueDatasetType.nameAndComponent()
1256 if componentName is not None:
1257 parentDatasetType = self.getDatasetType(parentName)
1258 composition.setdefault(parentDatasetType, []).append(componentName)
1259 else:
1260 composition.setdefault(trueDatasetType, []).append(None)
1261 elif datasetType.isComponent():
1262 # We were given a true DatasetType instance, but it's a component.
1263 # the composition dict will have exactly one item.
1264 parentName, componentName = datasetType.nameAndComponent()
1265 parentDatasetType = self.getDatasetType(parentName)
1266 composition = {parentDatasetType: [componentName]}
1267 if composition is not None:
1268 # We need to recurse. Do that once for each parent dataset type.
1269 for parentDatasetType, componentNames in composition.items():
1270 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1271 dimensions=dimensions, dataId=standardizedDataId,
1272 where=where, deduplicate=deduplicate):
1273 # Loop over components, yielding one DatasetRef for each
1274 # one requested.
1275 for componentName in componentNames:
1276 if componentName is None:
1277 yield parentRef
1278 else:
1279 yield parentRef.makeComponentRef(componentName)
1280 return
1281 # If we get here, there's no need to recurse (or we are already
1282 # recursing; there can only ever be one level of recursion).
1284 # The full set of dimensions in the query is the combination of those
1285 # needed for the DatasetType and those explicitly requested, if any.
1286 requestedDimensionNames = set(datasetType.dimensions.names)
1287 if dimensions is not None:
1288 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1289 # Construct the summary structure needed to construct a QueryBuilder.
1290 summary = QuerySummary(
1291 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1292 dataId=standardizedDataId,
1293 expression=where,
1294 )
1295 builder = self.makeQueryBuilder(summary)
1296 # Add the dataset subquery to the query, telling the QueryBuilder to
1297 # include the rank of the selected collection in the results only if we
1298 # need to deduplicate. Note that if any of the collections are
1299 # actually wildcard expressions, and we've asked for deduplication,
1300 # this will raise TypeError for us.
1301 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1302 return
1303 query = builder.finish()
1304 predicate = query.predicate()
1305 if not deduplicate:
1306 # No need to de-duplicate across collections.
1307 for row in self._db.query(query.sql):
1308 if predicate(row):
1309 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1310 if expand:
1311 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1312 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1313 else:
1314 # For each data ID, yield only the DatasetRef with the lowest
1315 # collection rank.
1316 bestRefs = {}
1317 bestRanks: Dict[DataCoordinate, int] = {}
1318 for row in self._db.query(query.sql):
1319 if predicate(row):
1320 ref, rank = query.extractDatasetRef(row, datasetType)
1321 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1322 assert rank is not None
1323 if rank < bestRank:
1324 bestRefs[ref.dataId] = ref
1325 bestRanks[ref.dataId] = rank
1326 # If caller requested expanded data IDs, we defer that until here
1327 # so we do as little expansion as possible.
1328 if expand:
1329 for ref in bestRefs.values():
1330 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1331 yield ref.expanded(dataId)
1332 else:
1333 yield from bestRefs.values()
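    # Usage sketch (dataset type expression, collection names, and the
    # ``where`` expression are hypothetical):
    #
    #     refs = list(registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/runs/test", "HSC/calib"],
    #         where="detector = 22",
    #         deduplicate=True,
    #     ))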
1335 storageClasses: StorageClassFactory
1336 """All storage classes known to the registry (`StorageClassFactory`).
1337 """