Coverage for python/lsst/daf/butler/registry/_registry.py : 11%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import sys
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataId,
51 DatasetRef,
52 DatasetType,
53 ddl,
54 Dimension,
55 DimensionElement,
56 DimensionGraph,
57 DimensionRecord,
58 DimensionUniverse,
59 ExpandedDataCoordinate,
60 NamedKeyDict,
61 StorageClassFactory,
62)
63from ..core.utils import doImport, iterable, transactional
64from ._config import RegistryConfig
65from .queries import (
66 QueryBuilder,
67 QuerySummary,
68)
69from ._collectionType import CollectionType
70from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
71from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
72from .interfaces import ChainedCollectionRecord, RunRecord
73from .versions import ButlerVersionsManager
75if TYPE_CHECKING:
76 from ..butlerConfig import ButlerConfig
77 from .interfaces import (
78 ButlerAttributeManager,
79 CollectionManager,
80 Database,
81 OpaqueTableStorageManager,
82 DimensionRecordStorageManager,
83 DatasetRecordStorageManager,
84 DatastoreRegistryBridgeManager,
85 )
88class Registry:
89 """Registry interface.
91 Parameters
92 ----------
93 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
94 Registry configuration
95 """
97 defaultConfigFile = None
98 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
99 absolute path. Can be None if no defaults specified.
100 """
102 @classmethod
103 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
104 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
105 """Create `Registry` subclass instance from `config`.
107 Uses ``registry.cls`` from `config` to determine which subclass to
108 instantiate.
110 Parameters
111 ----------
112 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
113 Registry configuration
114 create : `bool`, optional
115 Assume empty Registry and create a new one.
116 butlerRoot : `str`, optional
117 Path to the repository root this `Registry` will manage.
118 writeable : `bool`, optional
119 If `True` (default) create a read-write connection to the database.
121 Returns
122 -------
123 registry : `Registry` (subclass)
124 A new `Registry` subclass instance.
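Examples
--------
A minimal sketch, not taken from the original source; the configuration
path is a placeholder and the import path assumes the usual package
layout:

>>> from lsst.daf.butler.registry import Registry, RegistryConfig
>>> config = RegistryConfig("/path/to/registry.yaml")  # hypothetical file
>>> registry = Registry.fromConfig(config, writeable=False)
>>> registry.isWriteable()
False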
125 """
126 if not isinstance(config, RegistryConfig):
127 if isinstance(config, str) or isinstance(config, Config):
128 config = RegistryConfig(config)
129 else:
130 raise ValueError("Incompatible Registry configuration: {}".format(config))
131 config.replaceRoot(butlerRoot)
132 DatabaseClass = config.getDatabaseClass()
133 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
134 namespace=config.get("namespace"), writeable=writeable)
135 universe = DimensionUniverse(config)
136 attributes = doImport(config["managers", "attributes"])
137 opaque = doImport(config["managers", "opaque"])
138 dimensions = doImport(config["managers", "dimensions"])
139 collections = doImport(config["managers", "collections"])
140 datasets = doImport(config["managers", "datasets"])
141 datastoreBridges = doImport(config["managers", "datastores"])
142 versions = ButlerVersionsManager.fromConfig(config.get("schema_versions"))
144 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
145 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
146 versions=versions, writeable=writeable, create=create)
148 def __init__(self, database: Database, universe: DimensionUniverse, *,
149 attributes: Type[ButlerAttributeManager],
150 opaque: Type[OpaqueTableStorageManager],
151 dimensions: Type[DimensionRecordStorageManager],
152 collections: Type[CollectionManager],
153 datasets: Type[DatasetRecordStorageManager],
154 datastoreBridges: Type[DatastoreRegistryBridgeManager],
155 versions: ButlerVersionsManager,
156 writeable: bool = True,
157 create: bool = False):
158 self._db = database
159 self.storageClasses = StorageClassFactory()
160 with self._db.declareStaticTables(create=create) as context:
161 self._attributes = attributes.initialize(self._db, context)
162 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
163 self._collections = collections.initialize(self._db, context)
164 self._datasets = datasets.initialize(self._db, context,
165 collections=self._collections,
166 universe=self.dimensions)
167 self._opaque = opaque.initialize(self._db, context)
168 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
169 opaque=self._opaque,
170 datasets=datasets,
171 universe=self.dimensions)
172 context.addInitializer(lambda db: versions.storeVersions(self._attributes))
174 # This call does not do anything right now as we do not have a way to
175 # split tables between sub-schemas yet.
176 versions.checkVersionDigests()
177 if not create:
178 # verify that configured versions are compatible with schema
179 versions.checkStoredVersions(self._attributes, writeable)
181 self._collections.refresh()
182 self._datasets.refresh(universe=self._dimensions.universe)
184 def __str__(self) -> str:
185 return str(self._db)
187 def __repr__(self) -> str:
188 return f"Registry({self._db!r}, {self.dimensions!r})"
190 def isWriteable(self) -> bool:
191 """Return `True` if this registry allows write operations, and `False`
192 otherwise.
193 """
194 return self._db.isWriteable()
196 @property
197 def dimensions(self) -> DimensionUniverse:
198 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
199 """
200 return self._dimensions.universe
202 @contextlib.contextmanager
203 def transaction(self) -> Iterator[None]:
204 """Return a context manager that represents a transaction.
205 """
206 # TODO make savepoint=False the default.
207 try:
208 with self._db.transaction():
209 yield
210 except BaseException:
211 # TODO: this clears the caches sometimes when we wouldn't actually
212 # need to. Can we avoid that?
213 self._dimensions.clearCaches()
214 raise
216 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
217 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
218 other data repository client.
220 Opaque table records can be added via `insertOpaqueData`, retrieved via
221 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
223 Parameters
224 ----------
225 tableName : `str`
226 Logical name of the opaque table. This may differ from the
227 actual name used in the database by a prefix and/or suffix.
228 spec : `ddl.TableSpec`
229 Specification for the table to be added.
230 """
231 self._opaque.register(tableName, spec)
233 @transactional
234 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
235 """Insert records into an opaque table.
237 Parameters
238 ----------
239 tableName : `str`
240 Logical name of the opaque table. Must match the name used in a
241 previous call to `registerOpaqueTable`.
242 data
243 Each additional positional argument is a dictionary that represents
244 a single row to be added.
245 """
246 self._opaque[tableName].insert(*data)
248 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
249 """Retrieve records from an opaque table.
251 Parameters
252 ----------
253 tableName : `str`
254 Logical name of the opaque table. Must match the name used in a
255 previous call to `registerOpaqueTable`.
256 where
257 Additional keyword arguments are interpreted as equality
258 constraints that restrict the returned rows (combined with AND);
259 keyword arguments are column names and values are the values they
260 must have.
262 Yields
263 ------
264 row : `dict`
265 A dictionary representing a single result row.
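Examples
--------
A rough sketch, assuming an opaque table named ``"datastore_records"``
was previously registered via `registerOpaqueTable`; the table and
column names are placeholders:

>>> registry.insertOpaqueData(
...     "datastore_records",
...     {"dataset_id": 42, "path": "a/b/c.fits"},
... )
>>> for row in registry.fetchOpaqueData("datastore_records", dataset_id=42):
...     print(row["path"])
a/b/c.fits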
266 """
267 yield from self._opaque[tableName].fetch(**where)
269 @transactional
270 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
271 """Remove records from an opaque table.
273 Parameters
274 ----------
275 tableName : `str`
276 Logical name of the opaque table. Must match the name used in a
277 previous call to `registerOpaqueTable`.
278 where
279 Additional keyword arguments are interpreted as equality
280 constraints that restrict the deleted rows (combined with AND);
281 keyword arguments are column names and values are the values they
282 must have.
283 """
284 self._opaque[tableName].delete(**where)
286 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
287 """Add a new collection if one with the given name does not exist.
289 Parameters
290 ----------
291 name : `str`
292 The name of the collection to create.
293 type : `CollectionType`
294 Enum value indicating the type of collection to create.
296 Notes
297 -----
298 This method cannot be called within transactions, as it needs to be
299 able to perform its own transaction to be concurrent.
300 """
301 self._collections.register(name, type)
303 def getCollectionType(self, name: str) -> CollectionType:
304 """Return an enumeration value indicating the type of the given
305 collection.
307 Parameters
308 ----------
309 name : `str`
310 The name of the collection.
312 Returns
313 -------
314 type : `CollectionType`
315 Enum value indicating the type of this collection.
317 Raises
318 ------
319 MissingCollectionError
320 Raised if no collection with the given name exists.
321 """
322 return self._collections.find(name).type
324 def registerRun(self, name: str) -> None:
325 """Add a new run if one with the given name does not exist.
327 Parameters
328 ----------
329 name : `str`
330 The name of the run to create.
332 Notes
333 -----
334 This method cannot be called within transactions, as it needs to be
335 able to perform its own transaction to be concurrent.
336 """
337 self._collections.register(name, CollectionType.RUN)
339 @transactional
340 def removeCollection(self, name: str) -> None:
341 """Completely remove the given collection.
343 Parameters
344 ----------
345 name : `str`
346 The name of the collection to remove.
348 Raises
349 ------
350 MissingCollectionError
351 Raised if no collection with the given name exists.
353 Notes
354 -----
355 If this is a `~CollectionType.RUN` collection, all datasets and quanta
356 in it are also fully removed. This requires that those datasets be
357 removed (or at least trashed) from any datastores that hold them first.
359 A collection may not be deleted as long as it is referenced by a
360 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
361 be deleted or redefined first.
362 """
363 self._collections.remove(name)
365 def getCollectionChain(self, parent: str) -> CollectionSearch:
366 """Return the child collections in a `~CollectionType.CHAINED`
367 collection.
369 Parameters
370 ----------
371 parent : `str`
372 Name of the chained collection. Must have already been added via
373 a call to `Registry.registerCollection`.
375 Returns
376 -------
377 children : `CollectionSearch`
378 An object that defines the search path of the collection.
379 See :ref:`daf_butler_collection_expressions` for more information.
381 Raises
382 ------
383 MissingCollectionError
384 Raised if ``parent`` does not exist in the `Registry`.
385 TypeError
386 Raised if ``parent`` does not correspond to a
387 `~CollectionType.CHAINED` collection.
388 """
389 record = self._collections.find(parent)
390 if record.type is not CollectionType.CHAINED:
391 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
392 assert isinstance(record, ChainedCollectionRecord)
393 return record.children
395 @transactional
396 def setCollectionChain(self, parent: str, children: Any) -> None:
397 """Define or redefine a `~CollectionType.CHAINED` collection.
399 Parameters
400 ----------
401 parent : `str`
402 Name of the chained collection. Must have already been added via
403 a call to `Registry.registerCollection`.
404 children : `Any`
405 An expression defining an ordered search of child collections,
406 generally an iterable of `str`. Restrictions on the dataset types
407 to be searched can also be included, by passing a mapping or an
408 iterable containing tuples; see
409 :ref:`daf_butler_collection_expressions` for more information.
411 Raises
412 ------
413 MissingCollectionError
414 Raised when any of the given collections do not exist in the
415 `Registry`.
416 TypeError
417 Raised if ``parent`` does not correspond to a
418 `~CollectionType.CHAINED` collection.
419 ValueError
420 Raised if the given collections contain a cycle.
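Examples
--------
An illustrative sketch; the collection names are placeholders:

>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerRun("HSC/runs/a")
>>> registry.registerRun("HSC/runs/b")
>>> registry.registerCollection("HSC/defaults", type=CollectionType.CHAINED)
>>> registry.setCollectionChain("HSC/defaults", ["HSC/runs/b", "HSC/runs/a"])
>>> chain = registry.getCollectionChain("HSC/defaults")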
421 """
422 record = self._collections.find(parent)
423 if record.type is not CollectionType.CHAINED:
424 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
425 assert isinstance(record, ChainedCollectionRecord)
426 children = CollectionSearch.fromExpression(children)
427 if children != record.children:
428 record.update(self._collections, children)
430 def registerDatasetType(self, datasetType: DatasetType) -> bool:
431 """
432 Add a new `DatasetType` to the Registry.
434 It is not an error to register the same `DatasetType` twice.
436 Parameters
437 ----------
438 datasetType : `DatasetType`
439 The `DatasetType` to be added.
441 Returns
442 -------
443 inserted : `bool`
444 `True` if ``datasetType`` was inserted, `False` if an identical
445 existing `DatasetType` was found. Note that in either case the
446 DatasetType is guaranteed to be defined in the Registry
447 consistently with the given definition.
449 Raises
450 ------
451 ValueError
452 Raised if the dimensions or storage class are invalid.
453 ConflictingDefinitionError
454 Raised if this DatasetType is already registered with a different
455 definition.
457 Notes
458 -----
459 This method cannot be called within transactions, as it needs to be
460 able to perform its own transaction to be concurrent.
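Examples
--------
A hedged sketch; the dataset type name, dimensions, and storage class
are illustrative and assume the default dimension universe plus a
storage class already known to the registry:

>>> from lsst.daf.butler import DatasetType
>>> calexpType = DatasetType(
...     "calexp",
...     dimensions=registry.dimensions.extract(["instrument", "visit", "detector"]),
...     storageClass="ExposureF",
... )
>>> registry.registerDatasetType(calexpType)
True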
461 """
462 _, inserted = self._datasets.register(datasetType)
463 return inserted
465 def getDatasetType(self, name: str) -> DatasetType:
466 """Get the `DatasetType`.
468 Parameters
469 ----------
470 name : `str`
471 Name of the type.
473 Returns
474 -------
475 type : `DatasetType`
476 The `DatasetType` associated with the given name.
478 Raises
479 ------
480 KeyError
481 Requested named DatasetType could not be found in registry.
482 """
483 storage = self._datasets.find(name)
484 if storage is None:
485 raise KeyError(f"DatasetType '{name}' could not be found.")
486 return storage.datasetType
488 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
489 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
490 """Find a dataset given its `DatasetType` and data ID.
492 This can be used to obtain a `DatasetRef` that permits the dataset to
493 be read from a `Datastore`. If the dataset is a component and can not
494 be found using the provided dataset type, a dataset ref for the parent
495 will be returned instead but with the correct dataset type.
497 Parameters
498 ----------
499 datasetType : `DatasetType` or `str`
500 A `DatasetType` or the name of one.
501 dataId : `dict` or `DataCoordinate`, optional
502 A `dict`-like object containing the `Dimension` links that identify
503 the dataset within a collection.
504 collections
505 An expression that fully or partially identifies the collections
506 to search for the dataset, such as a `str`, `re.Pattern`, or
507 iterable thereof. `...` can be used to return all collections.
508 See :ref:`daf_butler_collection_expressions` for more information.
509 **kwargs
510 Additional keyword arguments passed to
511 `DataCoordinate.standardize` to convert ``dataId`` to a true
512 `DataCoordinate` or augment an existing one.
514 Returns
515 -------
516 ref : `DatasetRef`
517 A reference to the dataset, or `None` if no matching Dataset
518 was found.
520 Raises
521 ------
522 LookupError
523 Raised if one or more data ID keys are missing or the dataset type
524 does not exist.
525 MissingCollectionError
526 Raised if any of ``collections`` does not exist in the registry.
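Examples
--------
A hedged sketch; the dataset type, data ID values, and collection name
are placeholders for things assumed to exist in the repository:

>>> ref = registry.findDataset(
...     "calexp",
...     instrument="HSC", visit=903334, detector=16,
...     collections="HSC/runs/example",
... )  # ``ref`` is `None` if no matching dataset was found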
527 """
528 if isinstance(datasetType, DatasetType):
529 storage = self._datasets.find(datasetType.name)
530 if storage is None:
531 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
532 else:
533 storage = self._datasets.find(datasetType)
534 if storage is None:
535 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
536 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
537 universe=self.dimensions, **kwargs)
538 collections = CollectionSearch.fromExpression(collections)
539 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
540 result = storage.find(collectionRecord, dataId)
541 if result is not None:
542 return result
544 return None
546 @transactional
547 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
548 run: str) -> List[DatasetRef]:
549 """Insert one or more datasets into the `Registry`
551 This always adds new datasets; to associate existing datasets with
552 a new collection, use ``associate``.
554 Parameters
555 ----------
556 datasetType : `DatasetType` or `str`
557 A `DatasetType` or the name of one.
558 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
559 Dimension-based identifiers for the new datasets.
560 run : `str`
561 The name of the run that produced the datasets.
563 Returns
564 -------
565 refs : `list` of `DatasetRef`
566 Resolved `DatasetRef` instances for all given data IDs (in the same
567 order).
569 Raises
570 ------
571 ConflictingDefinitionError
572 If a dataset with the same dataset type and data ID as one of those
573 given already exists in ``run``.
574 MissingCollectionError
575 Raised if ``run`` does not exist in the registry.
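Examples
--------
An illustrative sketch, assuming ``"raw"`` is a registered dataset type
with dimensions ``{instrument, exposure, detector}`` and that the
corresponding dimension records already exist:

>>> registry.registerRun("DummyCam/raw/all")
>>> refs = registry.insertDatasets(
...     "raw",
...     dataIds=[{"instrument": "DummyCam", "exposure": 1, "detector": 0}],
...     run="DummyCam/raw/all",
... )
>>> len(refs)
1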
576 """
577 if isinstance(datasetType, DatasetType):
578 storage = self._datasets.find(datasetType.name)
579 if storage is None:
580 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
581 else:
582 storage = self._datasets.find(datasetType)
583 if storage is None:
584 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
585 runRecord = self._collections.find(run)
586 if runRecord.type is not CollectionType.RUN:
587 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
588 assert isinstance(runRecord, RunRecord)
589 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
590 for dataId in dataIds]
591 try:
592 refs = list(storage.insert(runRecord, expandedDataIds))
593 except sqlalchemy.exc.IntegrityError as err:
594 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
595 f"one or more datasets of type {storage.datasetType} into "
596 f"collection '{run}'. "
597 f"This probably means a dataset with the same data ID "
598 f"and dataset type already exists, but it may also mean a "
599 f"dimension row is missing.") from err
600 return refs
602 def getDataset(self, id: int) -> Optional[DatasetRef]:
603 """Retrieve a Dataset entry.
605 Parameters
606 ----------
607 id : `int`
608 The unique identifier for the dataset.
610 Returns
611 -------
612 ref : `DatasetRef` or `None`
613 A ref to the Dataset, or `None` if no matching Dataset
614 was found.
615 """
616 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
617 if ref is None:
618 return None
619 return ref
621 @transactional
622 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
623 """Remove datasets from the Registry.
625 The datasets will be removed unconditionally from all collections, and
626 any `Quantum` that consumed this dataset will instead be marked as
627 having a NULL input. `Datastore` records will *not* be deleted; the
628 caller is responsible for ensuring that the dataset has already been
629 removed from all Datastores.
631 Parameters
632 ----------
633 refs : `Iterable` of `DatasetRef`
634 References to the datasets to be removed. Must include a valid
635 ``id`` attribute, and should be considered invalidated upon return.
637 Raises
638 ------
639 AmbiguousDatasetError
640 Raised if any ``ref.id`` is `None`.
641 OrphanedRecordError
642 Raised if any dataset is still present in any `Datastore`.
643 """
644 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
645 storage = self._datasets.find(datasetType.name)
646 assert storage is not None
647 try:
648 storage.delete(refsForType)
649 except sqlalchemy.exc.IntegrityError as err:
650 raise OrphanedRecordError("One or more datasets is still "
651 "present in one or more Datastores.") from err
653 @transactional
654 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
655 """Add existing datasets to a `~CollectionType.TAGGED` collection.
657 If a DatasetRef with the same exact integer ID is already in a
658 collection nothing is changed. If a `DatasetRef` with the same
659 `DatasetType` and data ID but with different integer ID
660 exists in the collection, `ConflictingDefinitionError` is raised.
662 Parameters
663 ----------
664 collection : `str`
665 Indicates the collection the datasets should be associated with.
666 refs : `Iterable` [ `DatasetRef` ]
667 An iterable of resolved `DatasetRef` instances that already exist
668 in this `Registry`.
670 Raises
671 ------
672 ConflictingDefinitionError
673 If a Dataset with the given `DatasetRef` already exists in the
674 given collection.
675 AmbiguousDatasetError
676 Raised if ``any(ref.id is None for ref in refs)``.
677 MissingCollectionError
678 Raised if ``collection`` does not exist in the registry.
679 TypeError
680 Raised if adding new datasets to the given ``collection`` is not
681 allowed.
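Examples
--------
A rough sketch; ``refs`` is assumed to be an iterable of resolved
`DatasetRef` instances (e.g. from `insertDatasets` or `queryDatasets`)
and the collection name is a placeholder:

>>> registry.registerCollection("DummyCam/tagged")  # TAGGED by default
>>> registry.associate("DummyCam/tagged", refs)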
682 """
683 collectionRecord = self._collections.find(collection)
684 if collectionRecord.type is not CollectionType.TAGGED:
685 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
686 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
687 storage = self._datasets.find(datasetType.name)
688 assert storage is not None
689 try:
690 storage.associate(collectionRecord, refsForType)
691 except sqlalchemy.exc.IntegrityError as err:
692 raise ConflictingDefinitionError(
693 f"Constraint violation while associating dataset of type {datasetType.name} with "
694 f"collection {collection}. This probably means that one or more datasets with the same "
695 f"dataset type and data ID already exist in the collection, but it may also indicate "
696 f"that the datasets do not exist."
697 ) from err
699 @transactional
700 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
701 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
703 ``collection`` and ``ref`` combinations that are not currently
704 associated are silently ignored.
706 Parameters
707 ----------
708 collection : `str`
709 The collection the datasets should no longer be associated with.
710 refs : `Iterable` [ `DatasetRef` ]
711 An iterable of resolved `DatasetRef` instances that already exist
712 in this `Registry`.
714 Raises
715 ------
716 AmbiguousDatasetError
717 Raised if any of the given dataset references is unresolved.
718 MissingCollectionError
719 Raised if ``collection`` does not exist in the registry.
720 TypeError
721 Raised if removing datasets from the given ``collection`` is not
722 allowed.
723 """
724 collectionRecord = self._collections.find(collection)
725 if collectionRecord.type is not CollectionType.TAGGED:
726 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
727 "expected TAGGED.")
728 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
729 storage = self._datasets.find(datasetType.name)
730 assert storage is not None
731 storage.disassociate(collectionRecord, refsForType)
733 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
734 """Return an object that allows a new `Datastore` instance to
735 communicate with this `Registry`.
737 Returns
738 -------
739 manager : `DatastoreRegistryBridgeManager`
740 Object that mediates communication between this `Registry` and its
741 associated datastores.
742 """
743 return self._datastoreBridges
745 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
746 """Retrieve datastore locations for a given dataset.
748 Parameters
749 ----------
750 ref : `DatasetRef`
751 A reference to the dataset for which to retrieve storage
752 information.
754 Returns
755 -------
756 datastores : `Iterable` [ `str` ]
757 All the matching datastores holding this dataset.
759 Raises
760 ------
761 AmbiguousDatasetError
762 Raised if ``ref.id`` is `None`.
763 """
764 return self._datastoreBridges.findDatastores(ref)
766 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
767 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None,
768 **kwargs: Any) -> ExpandedDataCoordinate:
769 """Expand a dimension-based data ID to include additional information.
771 Parameters
772 ----------
773 dataId : `DataCoordinate` or `dict`, optional
774 Data ID to be expanded; augmented and overridden by ``kwargs``.
775 graph : `DimensionGraph`, optional
776 Set of dimensions for the expanded ID. If `None`, the dimensions
777 will be inferred from the keys of ``dataId`` and ``kwargs``.
778 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
779 are silently ignored, providing a way to extract and expand a
780 subset of a data ID.
781 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional
782 Dimension record data to use before querying the database for that
783 data.
784 **kwargs
785 Additional keywords are treated like additional key-value pairs for
786 ``dataId``, extending and overriding it.
788 Returns
789 -------
790 expanded : `ExpandedDataCoordinate`
791 A data ID that includes full metadata for all of the dimensions it
792 identifies.
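Examples
--------
A hedged sketch, assuming the default dimension universe and that the
matching ``instrument`` and ``visit`` dimension records have already
been inserted; the values are placeholders:

>>> expanded = registry.expandDataId(instrument="HSC", visit=903334)
>>> records = expanded.records  # per-element `DimensionRecord` objects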
793 """
794 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
795 if isinstance(standardized, ExpandedDataCoordinate):
796 return standardized
797 elif isinstance(dataId, ExpandedDataCoordinate):
798 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
799 records.update(dataId.records)
800 else:
801 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
802 keys = dict(standardized.byName())
803 for element in standardized.graph.primaryKeyTraversalOrder:
804 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
805 if record is ...:
806 storage = self._dimensions[element]
807 record = storage.fetch(keys)
808 records[element] = record
809 if record is not None:
810 for d in element.implied:
811 value = getattr(record, d.name)
812 if keys.setdefault(d.name, value) != value:
813 raise InconsistentDataIdError(
814 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
815 f"but {element.name} implies {d.name}={value!r}."
816 )
817 else:
818 if element in standardized.graph.required:
819 raise LookupError(
820 f"Could not fetch record for required dimension {element.name} via keys {keys}."
821 )
822 if element.alwaysJoin:
823 raise InconsistentDataIdError(
824 f"Could not fetch record for element {element.name} via keys {keys}, ",
825 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
826 "related."
827 )
828 records.update((d, None) for d in element.implied)
829 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
831 def insertDimensionData(self, element: Union[DimensionElement, str],
832 *data: Union[Mapping[str, Any], DimensionRecord],
833 conform: bool = True) -> None:
834 """Insert one or more dimension records into the database.
836 Parameters
837 ----------
838 element : `DimensionElement` or `str`
839 The `DimensionElement` or name thereof that identifies the table
840 records will be inserted into.
841 data : `dict` or `DimensionRecord` (variadic)
842 One or more records to insert.
843 conform : `bool`, optional
844 If `False` (`True` is default) perform no checking or conversions,
845 and assume that ``element`` is a `DimensionElement` instance and
846 ``data`` is one or more `DimensionRecord` instances of the
847 appropriate subclass.
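Examples
--------
An illustrative sketch using the default dimension universe; the record
values are placeholders and only a subset of the optional columns is
shown (the column names are assumptions about that universe):

>>> registry.insertDimensionData("instrument", {"name": "DummyCam"})
>>> registry.insertDimensionData(
...     "physical_filter",
...     {"instrument": "DummyCam", "name": "dummy_r", "abstract_filter": "r"},
... )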
848 """
849 if conform:
850 if isinstance(element, str):
851 element = self.dimensions[element]
852 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
853 for row in data]
854 else:
855 # Ignore typing since caller said to trust them with conform=False.
856 records = data # type: ignore
857 storage = self._dimensions[element] # type: ignore
858 storage.insert(*records)
860 def syncDimensionData(self, element: Union[DimensionElement, str],
861 row: Union[Mapping[str, Any], DimensionRecord],
862 conform: bool = True) -> bool:
863 """Synchronize the given dimension record with the database, inserting
864 if it does not already exist and comparing values if it does.
866 Parameters
867 ----------
868 element : `DimensionElement` or `str`
869 The `DimensionElement` or name thereof that identifies the table
870 records will be inserted into.
871 row : `dict` or `DimensionRecord`
872 The record to insert.
873 conform : `bool`, optional
874 If `False` (`True` is default) perform no checking or conversions,
875 and assume that ``element`` is a `DimensionElement` instance and
876 ``row`` is a `DimensionRecord` instance of the
877 appropriate subclass.
879 Returns
880 -------
881 inserted : `bool`
882 `True` if a new row was inserted, `False` otherwise.
884 Raises
885 ------
886 ConflictingDefinitionError
887 Raised if the record exists in the database (according to primary
888 key lookup) but is inconsistent with the given one.
890 Notes
891 -----
892 This method cannot be called within transactions, as it needs to be
893 able to perform its own transaction to be concurrent.
894 """
895 if conform:
896 if isinstance(element, str):
897 element = self.dimensions[element]
898 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
899 else:
900 # Ignore typing since caller said to trust them with conform=False.
901 record = row # type: ignore
902 storage = self._dimensions[element] # type: ignore
903 return storage.sync(record)
905 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
906 ) -> Iterator[DatasetType]:
907 """Iterate over the dataset types whose names match an expression.
909 Parameters
910 ----------
911 expression : `Any`, optional
912 An expression that fully or partially identifies the dataset types
913 to return, such as a `str`, `re.Pattern`, or iterable thereof.
914 `...` can be used to return all dataset types, and is the default.
915 See :ref:`daf_butler_dataset_type_expressions` for more
916 information.
917 components : `bool`, optional
918 If `True`, apply all expression patterns to component dataset type
919 names as well. If `False`, never apply patterns to components.
920 If `None` (default), apply patterns to components only if their
921 parent datasets were not matched by the expression.
922 Fully-specified component datasets (`str` or `DatasetType`
923 instances) are always included.
925 Yields
926 ------
927 datasetType : `DatasetType`
928 A `DatasetType` instance whose name matches ``expression``.
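Examples
--------
A minimal sketch; the pattern and names are placeholders:

>>> import re
>>> rawTypes = list(registry.queryDatasetTypes(re.compile("^raw.*")))
>>> allTypes = list(registry.queryDatasetTypes(...))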
929 """
930 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
931 if wildcard is Ellipsis:
932 for datasetType in self._datasets:
933 # The dataset type can no longer be a component
934 yield datasetType
935 if components and datasetType.isComposite():
936 # Automatically create the component dataset types
937 for component in datasetType.makeAllComponentDatasetTypes():
938 yield component
939 return
940 done: Set[str] = set()
941 for name in wildcard.strings:
942 storage = self._datasets.find(name)
943 if storage is not None:
944 done.add(storage.datasetType.name)
945 yield storage.datasetType
946 if wildcard.patterns:
947 # If components (the argument) is None, we'll save component
948 # datasets that we might want to match, but only if their parents
949 # didn't get included.
950 componentsForLater = []
951 for registeredDatasetType in self._datasets:
952 # Components are not stored in registry so expand them here
953 allDatasetTypes = [registeredDatasetType] \
954 + registeredDatasetType.makeAllComponentDatasetTypes()
955 for datasetType in allDatasetTypes:
956 if datasetType.name in done:
957 continue
958 parentName, componentName = datasetType.nameAndComponent()
959 if componentName is not None and not components:
960 if components is None and parentName not in done:
961 componentsForLater.append(datasetType)
962 continue
963 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
964 done.add(datasetType.name)
965 yield datasetType
966 # Go back and try to match saved components.
967 for datasetType in componentsForLater:
968 parentName, _ = datasetType.nameAndComponent()
969 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
970 yield datasetType
972 def queryCollections(self, expression: Any = ...,
973 datasetType: Optional[DatasetType] = None,
974 collectionType: Optional[CollectionType] = None,
975 flattenChains: bool = False,
976 includeChains: Optional[bool] = None) -> Iterator[str]:
977 """Iterate over the collections whose names match an expression.
979 Parameters
980 ----------
981 expression : `Any`, optional
982 An expression that fully or partially identifies the collections
983 to return, such as a `str`, `re.Pattern`, or iterable thereof.
984 `...` can be used to return all collections, and is the default.
985 See :ref:`daf_butler_collection_expressions` for more
986 information.
987 datasetType : `DatasetType`, optional
988 If provided, only yield collections that should be searched for
989 this dataset type according to ``expression``. If this is
990 not provided, any dataset type restrictions in ``expression`` are
991 ignored.
992 collectionType : `CollectionType`, optional
993 If provided, only yield collections of this type.
994 flattenChains : `bool`, optional
995 If `True` (`False` is default), recursively yield the child
996 collections of matching `~CollectionType.CHAINED` collections.
997 includeChains : `bool`, optional
998 If `True`, yield records for matching `~CollectionType.CHAINED`
999 collections. Default is the opposite of ``flattenChains``: include
1000 either CHAINED collections or their children, but not both.
1002 Yields
1003 ------
1004 collection : `str`
1005 The name of a collection that matches ``expression``.
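Examples
--------
A minimal sketch; the pattern is a placeholder:

>>> import re
>>> from lsst.daf.butler.registry import CollectionType
>>> runs = list(
...     registry.queryCollections(re.compile("HSC/runs/.*"),
...                               collectionType=CollectionType.RUN)
... )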
1006 """
1007 query = CollectionQuery.fromExpression(expression)
1008 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1009 flattenChains=flattenChains, includeChains=includeChains):
1010 yield record.name
1012 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1013 """Return a `QueryBuilder` instance capable of constructing and
1014 managing more complex queries than those obtainable via `Registry`
1015 interfaces.
1017 This is an advanced interface; downstream code should prefer
1018 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1019 are sufficient.
1021 Parameters
1022 ----------
1023 summary : `QuerySummary`
1024 Object describing and categorizing the full set of dimensions that
1025 will be included in the query.
1027 Returns
1028 -------
1029 builder : `QueryBuilder`
1030 Object that can be used to construct and perform advanced queries.
1031 """
1032 return QueryBuilder(summary=summary,
1033 collections=self._collections,
1034 dimensions=self._dimensions,
1035 datasets=self._datasets)
1037 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1038 dataId: Optional[DataId] = None,
1039 datasets: Any = None,
1040 collections: Any = None,
1041 where: Optional[str] = None,
1042 expand: bool = True,
1043 components: Optional[bool] = None,
1044 **kwargs: Any) -> Iterator[DataCoordinate]:
1045 """Query for and iterate over data IDs matching user-provided criteria.
1047 Parameters
1048 ----------
1049 dimensions : `Dimension` or `str`, or iterable thereof
1050 The dimensions of the data IDs to yield, as either `Dimension`
1051 instances or `str`. Will be automatically expanded to a complete
1052 `DimensionGraph`.
1053 dataId : `dict` or `DataCoordinate`, optional
1054 A data ID whose key-value pairs are used as equality constraints
1055 in the query.
1056 datasets : `Any`, optional
1057 An expression that fully or partially identifies dataset types
1058 that should constrain the yielded data IDs. For example, including
1059 "raw" here would constrain the yielded ``instrument``,
1060 ``exposure``, ``detector``, and ``physical_filter`` values to only
1061 those for which at least one "raw" dataset exists in
1062 ``collections``. Allowed types include `DatasetType`, `str`,
1063 `re.Pattern`, and iterables thereof. Unlike other dataset type
1064 expressions, `...` is not permitted - it doesn't make sense to
1065 constrain data IDs on the existence of *all* datasets.
1066 See :ref:`daf_butler_dataset_type_expressions` for more
1067 information.
1068 collections : `Any`, optional
1069 An expression that fully or partially identifies the collections
1070 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1071 thereof. `...` can be used to return all collections. Must be
1072 provided if ``datasets`` is, and is ignored if it is not. See
1073 :ref:`daf_butler_collection_expressions` for more information.
1074 where : `str`, optional
1075 A string expression similar to a SQL WHERE clause. May involve
1076 any column of a dimension table or (as a shortcut for the primary
1077 key column of a dimension table) dimension name. See
1078 :ref:`daf_butler_dimension_expressions` for more information.
1079 expand : `bool`, optional
1080 If `True` (default) yield `ExpandedDataCoordinate` instead of
1081 minimal `DataCoordinate` base-class instances.
1082 components : `bool`, optional
1083 If `True`, apply all dataset expression patterns to component
1084 dataset type names as well. If `False`, never apply patterns to
1085 components. If `None` (default), apply patterns to components only
1086 if their parent datasets were not matched by the expression.
1087 Fully-specified component datasets (`str` or `DatasetType`
1088 instances) are always included.
1089 **kwargs
1090 Additional keyword arguments are forwarded to
1091 `DataCoordinate.standardize` when processing the ``dataId``
1092 argument (and may be used to provide a constraining data ID even
1093 when the ``dataId`` argument is `None`).
1095 Yields
1096 ------
1097 dataId : `DataCoordinate`
1098 Data IDs matching the given query parameters. Order is
1099 unspecified.
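Examples
--------
A hedged sketch: yield data IDs for every visit+detector combination
for which at least one ``"raw"`` dataset exists in a (placeholder)
collection:

>>> dataIds = list(
...     registry.queryDimensions(
...         ["visit", "detector"],
...         datasets="raw",
...         collections="DummyCam/raw/all",
...         instrument="DummyCam",
...     )
... )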
1100 """
1101 dimensions = iterable(dimensions)
1102 standardizedDataId = self.expandDataId(dataId, **kwargs)
1103 standardizedDatasetTypes = set()
1104 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1105 if datasets is not None:
1106 if collections is None:
1107 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1108 for datasetType in self.queryDatasetTypes(datasets, components=components):
1109 requestedDimensionNames.update(datasetType.dimensions.names)
1110 # If any matched dataset type is a component, just operate on
1111 # its parent instead, because Registry doesn't know anything
1112 # about what components exist, and here (unlike queryDatasets)
1113 # we don't care about returning them.
1114 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1115 if componentName is not None:
1116 datasetType = self.getDatasetType(parentDatasetTypeName)
1117 standardizedDatasetTypes.add(datasetType)
1118 # Preprocess collections expression in case the original included
1119 # single-pass iterators (we'll want to use it multiple times
1120 # below).
1121 collections = CollectionQuery.fromExpression(collections)
1123 summary = QuerySummary(
1124 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1125 dataId=standardizedDataId,
1126 expression=where,
1127 )
1128 builder = self.makeQueryBuilder(summary)
1129 for datasetType in standardizedDatasetTypes:
1130 builder.joinDataset(datasetType, collections, isResult=False)
1131 query = builder.finish()
1132 predicate = query.predicate()
1133 for row in self._db.query(query.sql):
1134 if predicate(row):
1135 result = query.extractDataId(row)
1136 if expand:
1137 yield self.expandDataId(result, records=standardizedDataId.records)
1138 else:
1139 yield result
1141 def queryDatasets(self, datasetType: Any, *,
1142 collections: Any,
1143 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1144 dataId: Optional[DataId] = None,
1145 where: Optional[str] = None,
1146 deduplicate: bool = False,
1147 expand: bool = True,
1148 components: Optional[bool] = None,
1149 **kwargs: Any) -> Iterator[DatasetRef]:
1150 """Query for and iterate over dataset references matching user-provided
1151 criteria.
1153 Parameters
1154 ----------
1155 datasetType
1156 An expression that fully or partially identifies the dataset types
1157 to be queried. Allowed types include `DatasetType`, `str`,
1158 `re.Pattern`, and iterables thereof. The special value `...` can
1159 be used to query all dataset types. See
1160 :ref:`daf_butler_dataset_type_expressions` for more information.
1161 collections
1162 An expression that fully or partially identifies the collections
1163 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1164 thereof. `...` can be used to return all collections. See
1165 :ref:`daf_butler_collection_expressions` for more information.
1166 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1167 Dimensions to include in the query (in addition to those used
1168 to identify the queried dataset type(s)), either to constrain
1169 the resulting datasets to those for which a matching dimension
1170 exists, or to relate the dataset type's dimensions to dimensions
1171 referenced by the ``dataId`` or ``where`` arguments.
1172 dataId : `dict` or `DataCoordinate`, optional
1173 A data ID whose key-value pairs are used as equality constraints
1174 in the query.
1175 where : `str`, optional
1176 A string expression similar to a SQL WHERE clause. May involve
1177 any column of a dimension table or (as a shortcut for the primary
1178 key column of a dimension table) dimension name. See
1179 :ref:`daf_butler_dimension_expressions` for more information.
1180 deduplicate : `bool`, optional
1181 If `True` (`False` is default), for each result data ID, only
1182 yield one `DatasetRef` of each `DatasetType`, from the first
1183 collection in which a dataset of that dataset type appears
1184 (according to the order of ``collections`` passed in). If `True`,
1185 ``collections`` must not contain regular expressions and may not
1186 be `...`.
1187 expand : `bool`, optional
1188 If `True` (default) attach `ExpandedDataCoordinate` instead of
1189 minimal `DataCoordinate` base-class instances.
1190 components : `bool`, optional
1191 If `True`, apply all dataset expression patterns to component
1192 dataset type names as well. If `False`, never apply patterns to
1193 components. If `None` (default), apply patterns to components only
1194 if their parent datasets were not matched by the expression.
1195 Fully-specified component datasets (`str` or `DatasetType`
1196 instances) are always included.
1197 **kwargs
1198 Additional keyword arguments are forwarded to
1199 `DataCoordinate.standardize` when processing the ``dataId``
1200 argument (and may be used to provide a constraining data ID even
1201 when the ``dataId`` argument is `None`).
1203 Yields
1204 ------
1205 ref : `DatasetRef`
1206 Dataset references matching the given query criteria. These
1207 are grouped by `DatasetType` if the query evaluates to multiple
1208 dataset types, but order is otherwise unspecified.
1210 Raises
1211 ------
1212 TypeError
1213 Raised when the arguments are incompatible, such as when a
1214 collection wildcard is passed when ``deduplicate`` is `True`.
1216 Notes
1217 -----
1218 When multiple dataset types are queried in a single call, the
1219 results of this operation are equivalent to querying for each dataset
1220 type separately in turn, and no information about the relationships
1221 between datasets of different types is included. In contexts where
1222 that kind of information is important, the recommended pattern is to
1223 use `queryDimensions` to first obtain data IDs (possibly with the
1224 desired dataset types and collections passed as constraints to the
1225 query), and then use multiple (generally much simpler) calls to
1226 `queryDatasets` with the returned data IDs passed as constraints.
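Examples
--------
A hedged sketch; the dataset type, collection names, and ``where``
expression are placeholders:

>>> refs = list(
...     registry.queryDatasets(
...         "calexp",
...         collections=["HSC/runs/a", "HSC/runs/b"],
...         where="instrument = 'HSC' AND visit = 903334 AND detector = 16",
...         deduplicate=True,
...     )
... )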
1227 """
1228 # Standardize the collections expression.
1229 if deduplicate:
1230 collections = CollectionSearch.fromExpression(collections)
1231 else:
1232 collections = CollectionQuery.fromExpression(collections)
1233 # Standardize and expand the data ID provided as a constraint.
1234 standardizedDataId = self.expandDataId(dataId, **kwargs)
1236 # We can only query directly if given a non-component DatasetType
1237 # instance. If we were given an expression or str or a component
1238 # DatasetType instance, we'll populate this dict, recurse, and return.
1239 # If we already have a non-component DatasetType, it will remain None
1240 # and we'll run the query directly.
1241 composition: Optional[
1242 Dict[
1243 DatasetType, # parent dataset type
1244 List[Optional[str]] # component name, or None for parent
1245 ]
1246 ] = None
1247 if not isinstance(datasetType, DatasetType):
1248 # We were given a dataset type expression (which may be as simple
1249 # as a str). Loop over all matching datasets, delegating handling
1250 # of the `components` argument to queryDatasetTypes, as we populate
1251 # the composition dict.
1252 composition = defaultdict(list)
1253 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1254 parentName, componentName = trueDatasetType.nameAndComponent()
1255 if componentName is not None:
1256 parentDatasetType = self.getDatasetType(parentName)
1257 composition.setdefault(parentDatasetType, []).append(componentName)
1258 else:
1259 composition.setdefault(trueDatasetType, []).append(None)
1260 elif datasetType.isComponent():
1261 # We were given a true DatasetType instance, but it's a component.
1262 # the composition dict will have exactly one item.
1263 parentName, componentName = datasetType.nameAndComponent()
1264 parentDatasetType = self.getDatasetType(parentName)
1265 composition = {parentDatasetType: [componentName]}
1266 if composition is not None:
1267 # We need to recurse. Do that once for each parent dataset type.
1268 for parentDatasetType, componentNames in composition.items():
1269 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1270 dimensions=dimensions, dataId=standardizedDataId,
1271 where=where, deduplicate=deduplicate):
1272 # Loop over components, yielding one DatasetRef for each
1273 # component requested.
1274 for componentName in componentNames:
1275 if componentName is None:
1276 yield parentRef
1277 else:
1278 yield parentRef.makeComponentRef(componentName)
1279 return
1280 # If we get here, there's no need to recurse (or we are already
1281 # recursing; there can only ever be one level of recursion).
1283 # The full set of dimensions in the query is the combination of those
1284 # needed for the DatasetType and those explicitly requested, if any.
1285 requestedDimensionNames = set(datasetType.dimensions.names)
1286 if dimensions is not None:
1287 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1288 # Construct the summary structure needed to construct a QueryBuilder.
1289 summary = QuerySummary(
1290 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1291 dataId=standardizedDataId,
1292 expression=where,
1293 )
1294 builder = self.makeQueryBuilder(summary)
1295 # Add the dataset subquery to the query, telling the QueryBuilder to
1296 # include the rank of the selected collection in the results only if we
1297 # need to deduplicate. Note that if any of the collections are
1298 # actually wildcard expressions, and we've asked for deduplication,
1299 # this will raise TypeError for us.
1300 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1301 return
1302 query = builder.finish()
1303 predicate = query.predicate()
1304 if not deduplicate:
1305 # No need to de-duplicate across collections.
1306 for row in self._db.query(query.sql):
1307 if predicate(row):
1308 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1309 if expand:
1310 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1311 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1312 else:
1313 # For each data ID, yield only the DatasetRef with the lowest
1314 # collection rank.
1315 bestRefs = {}
1316 bestRanks: Dict[DataCoordinate, int] = {}
1317 for row in self._db.query(query.sql):
1318 if predicate(row):
1319 ref, rank = query.extractDatasetRef(row, datasetType)
1320 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1321 assert rank is not None
1322 if rank < bestRank:
1323 bestRefs[ref.dataId] = ref
1324 bestRanks[ref.dataId] = rank
1325 # If caller requested expanded data IDs, we defer that until here
1326 # so we do as little expansion as possible.
1327 if expand:
1328 for ref in bestRefs.values():
1329 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1330 yield ref.expanded(dataId)
1331 else:
1332 yield from bestRefs.values()
1334 storageClasses: StorageClassFactory
1335 """All storage classes known to the registry (`StorageClassFactory`).
1336 """