Coverage for python/lsst/daf/butler/registry/_registry.py : 11%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import sys
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataId,
51 DatasetRef,
52 DatasetType,
53 ddl,
54 Dimension,
55 DimensionElement,
56 DimensionGraph,
57 DimensionRecord,
58 DimensionUniverse,
59 ExpandedDataCoordinate,
60 NamedKeyDict,
61 StorageClassFactory,
62)
63from ..core.utils import doImport, iterable, transactional
64from ._config import RegistryConfig
65from .queries import (
66 QueryBuilder,
67 QuerySummary,
68)
69from ._collectionType import CollectionType
70from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
71from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
72from .interfaces import ChainedCollectionRecord, RunRecord
74if TYPE_CHECKING:
75 from ..butlerConfig import ButlerConfig
76 from .interfaces import (
77 ButlerAttributeManager,
78 CollectionManager,
79 Database,
80 OpaqueTableStorageManager,
81 DimensionRecordStorageManager,
82 DatasetRecordStorageManager,
83 DatastoreRegistryBridgeManager,
84 )
87class Registry:
88 """Registry interface.
90 Parameters
91 ----------
92 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
93 Registry configuration
94 """
96 defaultConfigFile = None
97 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
98 absolute path. Can be None if no defaults specified.
99 """
101 @classmethod
102 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
103 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
104 """Create `Registry` subclass instance from `config`.
106 Uses ``registry.cls`` from `config` to determine which subclass to
107 instantiate.
109 Parameters
110 ----------
111 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
112 Registry configuration
113 create : `bool`, optional
114 If `True`, assume an empty registry and create a new one.
115 butlerRoot : `str`, optional
116 Path to the repository root this `Registry` will manage.
117 writeable : `bool`, optional
118 If `True` (default) create a read-write connection to the database.
120 Returns
121 -------
122 registry : `Registry` (subclass)
123 A new `Registry` subclass instance.
124 """
125 if not isinstance(config, RegistryConfig):
126 if isinstance(config, (str, Config)):
127 config = RegistryConfig(config)
128 else:
129 raise ValueError("Incompatible Registry configuration: {}".format(config))
130 config.replaceRoot(butlerRoot)
131 DatabaseClass = config.getDatabaseClass()
132 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
133 namespace=config.get("namespace"), writeable=writeable)
134 universe = DimensionUniverse(config)
135 attributes = doImport(config["managers", "attributes"])
136 opaque = doImport(config["managers", "opaque"])
137 dimensions = doImport(config["managers", "dimensions"])
138 collections = doImport(config["managers", "collections"])
139 datasets = doImport(config["managers", "datasets"])
140 datastoreBridges = doImport(config["managers", "datastores"])
141 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
142 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
143 create=create)
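# Example (illustrative sketch only): constructing a `Registry` from
# configuration.  The file name below is hypothetical; ``fromConfig`` accepts
# a `ButlerConfig`, `RegistryConfig`, `Config`, or `str`.
#
#     config = RegistryConfig("registry.yaml")   # hypothetical path
#     registry = Registry.fromConfig(config, create=True, writeable=True)
#     assert registry.isWriteable()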
145 def __init__(self, database: Database, universe: DimensionUniverse, *,
146 attributes: Type[ButlerAttributeManager],
147 opaque: Type[OpaqueTableStorageManager],
148 dimensions: Type[DimensionRecordStorageManager],
149 collections: Type[CollectionManager],
150 datasets: Type[DatasetRecordStorageManager],
151 datastoreBridges: Type[DatastoreRegistryBridgeManager],
152 create: bool = False):
153 self._db = database
154 self.storageClasses = StorageClassFactory()
155 with self._db.declareStaticTables(create=create) as context:
156 self._attributes = attributes.initialize(self._db, context)
157 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
158 self._collections = collections.initialize(self._db, context)
159 self._datasets = datasets.initialize(self._db, context,
160 collections=self._collections,
161 universe=self.dimensions)
162 self._opaque = opaque.initialize(self._db, context)
163 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
164 opaque=self._opaque,
165 datasets=datasets,
166 universe=self.dimensions)
167 self._collections.refresh()
168 self._datasets.refresh(universe=self._dimensions.universe)
170 def __str__(self) -> str:
171 return str(self._db)
173 def __repr__(self) -> str:
174 return f"Registry({self._db!r}, {self.dimensions!r})"
176 def isWriteable(self) -> bool:
177 """Return `True` if this registry allows write operations, and `False`
178 otherwise.
179 """
180 return self._db.isWriteable()
182 @property
183 def dimensions(self) -> DimensionUniverse:
184 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
185 """
186 return self._dimensions.universe
188 @contextlib.contextmanager
189 def transaction(self) -> Iterator[None]:
190 """Return a context manager that represents a transaction.
191 """
192 # TODO make savepoint=False the default.
193 try:
194 with self._db.transaction():
195 yield
196 except BaseException:
197 # TODO: this clears the caches sometimes when we wouldn't actually
198 # need to. Can we avoid that?
199 self._dimensions.clearCaches()
200 raise
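# Example (illustrative sketch only): grouping several registry operations so
# that they commit or roll back together.  The collection names and ``refs``
# (an iterable of resolved `DatasetRef` instances) are hypothetical.
#
#     with registry.transaction():
#         registry.associate("tagged/good", refs)
#         registry.disassociate("tagged/candidates", refs)
#     # If either call raises, neither change is committed.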
202 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
203 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
204 other data repository client.
206 Opaque table records can be added via `insertOpaqueData`, retrieved via
207 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
209 Parameters
210 ----------
211 tableName : `str`
212 Logical name of the opaque table. This may differ from the
213 actual name used in the database by a prefix and/or suffix.
214 spec : `ddl.TableSpec`
215 Specification for the table to be added.
216 """
217 self._opaque.register(tableName, spec)
219 @transactional
220 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
221 """Insert records into an opaque table.
223 Parameters
224 ----------
225 tableName : `str`
226 Logical name of the opaque table. Must match the name used in a
227 previous call to `registerOpaqueTable`.
228 data
229 Each additional positional argument is a dictionary that represents
230 a single row to be added.
231 """
232 self._opaque[tableName].insert(*data)
234 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
235 """Retrieve records from an opaque table.
237 Parameters
238 ----------
239 tableName : `str`
240 Logical name of the opaque table. Must match the name used in a
241 previous call to `registerOpaqueTable`.
242 where
243 Additional keyword arguments are interpreted as equality
244 constraints that restrict the returned rows (combined with AND);
245 keyword arguments are column names and values are the values they
246 must have.
248 Yields
249 ------
250 row : `dict`
251 A dictionary representing a single result row.
252 """
253 yield from self._opaque[tableName].fetch(**where)
255 @transactional
256 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
257 """Remove records from an opaque table.
259 Parameters
260 ----------
261 tableName : `str`
262 Logical name of the opaque table. Must match the name used in a
263 previous call to `registerOpaqueTable`.
264 where
265 Additional keyword arguments are interpreted as equality
266 constraints that restrict the deleted rows (combined with AND);
267 keyword arguments are column names and values are the values they
268 must have.
269 """
270 self._opaque[tableName].delete(**where)
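# Example (illustrative sketch only): how a datastore-like client might use
# the opaque-table methods above.  The table name, row contents, and the
# ``spec`` variable (a `ddl.TableSpec` describing the columns) are
# hypothetical.
#
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records",
#                               {"dataset_id": 1, "path": "a/b.fits"},
#                               {"dataset_id": 2, "path": "a/c.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=2)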
272 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
273 """Add a new collection if one with the given name does not exist.
275 Parameters
276 ----------
277 name : `str`
278 The name of the collection to create.
279 type : `CollectionType`
280 Enum value indicating the type of collection to create.
282 Notes
283 -----
284 This method cannot be called within transactions, as it needs to be
285 able to perform its own transaction to be concurrent.
286 """
287 self._collections.register(name, type)
289 def getCollectionType(self, name: str) -> CollectionType:
290 """Return an enumeration value indicating the type of the given
291 collection.
293 Parameters
294 ----------
295 name : `str`
296 The name of the collection.
298 Returns
299 -------
300 type : `CollectionType`
301 Enum value indicating the type of this collection.
303 Raises
304 ------
305 MissingCollectionError
306 Raised if no collection with the given name exists.
307 """
308 return self._collections.find(name).type
310 def registerRun(self, name: str) -> None:
311 """Add a new run if one with the given name does not exist.
313 Parameters
314 ----------
315 name : `str`
316 The name of the run to create.
318 Notes
319 -----
320 This method cannot be called within transactions, as it needs to be
321 able to perform its own transaction to be concurrent.
322 """
323 self._collections.register(name, CollectionType.RUN)
325 @transactional
326 def removeCollection(self, name: str) -> None:
327 """Completely remove the given collection.
329 Parameters
330 ----------
331 name : `str`
332 The name of the collection to remove.
334 Raises
335 ------
336 MissingCollectionError
337 Raised if no collection with the given name exists.
339 Notes
340 -----
341 If this is a `~CollectionType.RUN` collection, all datasets and quanta
342 in it are also fully removed. This requires that those datasets be
343 removed (or at least trashed) from any datastores that hold them first.
345 A collection may not be deleted as long as it is referenced by a
346 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
347 be deleted or redefined first.
348 """
349 self._collections.remove(name)
351 def getCollectionChain(self, parent: str) -> CollectionSearch:
352 """Return the child collections in a `~CollectionType.CHAINED`
353 collection.
355 Parameters
356 ----------
357 parent : `str`
358 Name of the chained collection. Must have already been added via
359 a call to `Registry.registerCollection`.
361 Returns
362 -------
363 children : `CollectionSearch`
364 An object that defines the search path of the collection.
365 See :ref:`daf_butler_collection_expressions` for more information.
367 Raises
368 ------
369 MissingCollectionError
370 Raised if ``parent`` does not exist in the `Registry`.
371 TypeError
372 Raised if ``parent`` does not correspond to a
373 `~CollectionType.CHAINED` collection.
374 """
375 record = self._collections.find(parent)
376 if record.type is not CollectionType.CHAINED:
377 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
378 assert isinstance(record, ChainedCollectionRecord)
379 return record.children
381 @transactional
382 def setCollectionChain(self, parent: str, children: Any) -> None:
383 """Define or redefine a `~CollectionType.CHAINED` collection.
385 Parameters
386 ----------
387 parent : `str`
388 Name of the chained collection. Must have already been added via
389 a call to `Registry.registerCollection`.
390 children : `Any`
391 An expression defining an ordered search of child collections,
392 generally an iterable of `str`. Restrictions on the dataset types
393 to be searched can also be included, by passing a mapping or an
394 iterable containing tuples; see
395 :ref:`daf_butler_collection_expressions` for more information.
397 Raises
398 ------
399 MissingCollectionError
400 Raised when any of the given collections do not exist in the
401 `Registry`.
402 TypeError
403 Raised if ``parent`` does not correspond to a
404 `~CollectionType.CHAINED` collection.
405 ValueError
406 Raised if the given collections contain a cycle.
407 """
408 record = self._collections.find(parent)
409 if record.type is not CollectionType.CHAINED:
410 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
411 assert isinstance(record, ChainedCollectionRecord)
412 children = CollectionSearch.fromExpression(children)
413 if children != record.children:
414 record.update(self._collections, children)
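# Example (illustrative sketch only): registering a RUN and a TAGGED
# collection, then chaining them behind a single CHAINED collection.  All
# collection names are hypothetical.
#
#     registry.registerRun("MyCam/raw/run1")
#     registry.registerCollection("MyCam/tagged", CollectionType.TAGGED)
#     registry.registerCollection("MyCam/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("MyCam/defaults",
#                                 ["MyCam/tagged", "MyCam/raw/run1"])
#     children = registry.getCollectionChain("MyCam/defaults")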
416 def registerDatasetType(self, datasetType: DatasetType) -> bool:
417 """
418 Add a new `DatasetType` to the Registry.
420 It is not an error to register the same `DatasetType` twice.
422 Parameters
423 ----------
424 datasetType : `DatasetType`
425 The `DatasetType` to be added.
427 Returns
428 -------
429 inserted : `bool`
430 `True` if ``datasetType`` was inserted, `False` if an identical
431 existing `DatasetType` was found. Note that in either case the
432 DatasetType is guaranteed to be defined in the Registry
433 consistently with the given definition.
435 Raises
436 ------
437 ValueError
438 Raised if the dimensions or storage class are invalid.
439 ConflictingDefinitionError
440 Raised if this DatasetType is already registered with a different
441 definition.
443 Notes
444 -----
445 This method cannot be called within transactions, as it needs to be
446 able to perform its own transaction to be concurrent.
447 """
448 _, inserted = self._datasets.register(datasetType)
449 return inserted
451 def getDatasetType(self, name: str) -> DatasetType:
452 """Get the `DatasetType`.
454 Parameters
455 ----------
456 name : `str`
457 Name of the type.
459 Returns
460 -------
461 type : `DatasetType`
462 The `DatasetType` associated with the given name.
464 Raises
465 ------
466 KeyError
467 Raised if the requested DatasetType could not be found in the registry.
468 """
469 storage = self._datasets.find(name)
470 if storage is None:
471 raise KeyError(f"DatasetType '{name}' could not be found.")
472 return storage.datasetType
474 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
475 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
476 """Find a dataset given its `DatasetType` and data ID.
478 This can be used to obtain a `DatasetRef` that permits the dataset to
479 be read from a `Datastore`. If the dataset is a component and can not
480 be found using the provided dataset type, a dataset ref for the parent
481 will be returned instead but with the correct dataset type.
483 Parameters
484 ----------
485 datasetType : `DatasetType` or `str`
486 A `DatasetType` or the name of one.
487 dataId : `dict` or `DataCoordinate`, optional
488 A `dict`-like object containing the `Dimension` links that identify
489 the dataset within a collection.
490 collections
491 An expression that fully or partially identifies the collections
492 to search for the dataset, such as a `str`, `re.Pattern`, or
493 iterable thereof. `...` can be used to search all collections.
494 See :ref:`daf_butler_collection_expressions` for more information.
495 **kwargs
496 Additional keyword arguments passed to
497 `DataCoordinate.standardize` to convert ``dataId`` to a true
498 `DataCoordinate` or augment an existing one.
500 Returns
501 -------
502 ref : `DatasetRef` or `None`
503 A reference to the dataset, or `None` if no matching Dataset
504 was found.
506 Raises
507 ------
508 LookupError
509 Raised if one or more data ID keys are missing or the dataset type
510 does not exist.
511 MissingCollectionError
512 Raised if any of ``collections`` does not exist in the registry.
513 """
514 if isinstance(datasetType, DatasetType):
515 storage = self._datasets.find(datasetType.name)
516 if storage is None:
517 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
518 else:
519 storage = self._datasets.find(datasetType)
520 if storage is None:
521 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
522 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
523 universe=self.dimensions, **kwargs)
524 collections = CollectionSearch.fromExpression(collections)
525 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
526 result = storage.find(collectionRecord, dataId)
527 if result is not None:
528 return result
530 # fallback to the parent if we got nothing and this was a component
531 if storage.datasetType.isComponent():
532 parentType, _ = storage.datasetType.nameAndComponent()
533 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
534 if parentRef is not None:
535 # Should already conform and we know no components
536 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
537 run=parentRef.run, conform=False, hasParentId=True)
539 return None
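# Example (illustrative sketch only): looking up a single dataset by data ID.
# The dataset type name, collection name, and data ID values are hypothetical
# and depend on the repository's dimension universe.
#
#     ref = registry.findDataset("raw",
#                                instrument="MyCam", exposure=123, detector=42,
#                                collections=["MyCam/defaults"])
#     if ref is not None:
#         print(ref.run, ref.dataId)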
541 @transactional
542 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
543 run: str) -> List[DatasetRef]:
544 """Insert one or more datasets into the `Registry`
546 This always adds new datasets; to associate existing datasets with
547 a new collection, use ``associate``.
549 Parameters
550 ----------
551 datasetType : `DatasetType` or `str`
552 A `DatasetType` or the name of one.
553 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
554 Dimension-based identifiers for the new datasets.
555 run : `str`
556 The name of the run that produced the datasets.
558 Returns
559 -------
560 refs : `list` of `DatasetRef`
561 Resolved `DatasetRef` instances for all given data IDs (in the same
562 order).
564 Raises
565 ------
566 ConflictingDefinitionError
567 If a dataset with the same dataset type and data ID as one of those
568 given already exists in ``run``.
569 MissingCollectionError
570 Raised if ``run`` does not exist in the registry.
571 """
572 if isinstance(datasetType, DatasetType):
573 storage = self._datasets.find(datasetType.name)
574 if storage is None:
575 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
576 else:
577 storage = self._datasets.find(datasetType)
578 if storage is None:
579 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
580 runRecord = self._collections.find(run)
581 if runRecord.type is not CollectionType.RUN:
582 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
583 assert isinstance(runRecord, RunRecord)
584 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
585 for dataId in dataIds]
586 try:
587 refs = list(storage.insert(runRecord, expandedDataIds))
588 except sqlalchemy.exc.IntegrityError as err:
589 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
590 f"one or more datasets of type {storage.datasetType} into "
591 f"collection '{run}'. "
592 f"This probably means a dataset with the same data ID "
593 f"and dataset type already exists, but it may also mean a "
594 f"dimension row is missing.") from err
595 return refs
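# Example (illustrative sketch only): inserting new datasets into a RUN
# collection.  The dataset type, run name, and data ID values are
# hypothetical; the dimension records they reference must already exist.
#
#     refs = registry.insertDatasets(
#         "raw",
#         dataIds=[{"instrument": "MyCam", "exposure": 123, "detector": 42}],
#         run="MyCam/raw/run1",
#     )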
597 def getDataset(self, id: int) -> Optional[DatasetRef]:
598 """Retrieve a Dataset entry.
600 Parameters
601 ----------
602 id : `int`
603 The unique identifier for the dataset.
605 Returns
606 -------
607 ref : `DatasetRef` or `None`
608 A ref to the Dataset, or `None` if no matching Dataset
609 was found.
610 """
611 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
612 if ref is None:
613 return None
614 return ref
616 @transactional
617 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
618 """Remove datasets from the Registry.
620 The datasets will be removed unconditionally from all collections, and
621 any `Quantum` that consumed this dataset will instead be marked as
622 having a NULL input. `Datastore` records will *not* be deleted; the
623 caller is responsible for ensuring that the dataset has already been
624 removed from all Datastores.
626 Parameters
627 ----------
628 refs : `Iterable` of `DatasetRef`
629 References to the datasets to be removed. Must include a valid
630 ``id`` attribute, and should be considered invalidated upon return.
632 Raises
633 ------
634 AmbiguousDatasetError
635 Raised if any ``ref.id`` is `None`.
636 OrphanedRecordError
637 Raised if any dataset is still present in any `Datastore`.
638 """
639 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
640 storage = self._datasets.find(datasetType.name)
641 assert storage is not None
642 try:
643 storage.delete(refsForType)
644 except sqlalchemy.exc.IntegrityError as err:
645 raise OrphanedRecordError("One or more datasets is still "
646 "present in one or more Datastores.") from err
648 @transactional
649 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
650 """Add existing datasets to a `~CollectionType.TAGGED` collection.
652 If a DatasetRef with the same exact integer ID is already in a
653 collection nothing is changed. If a `DatasetRef` with the same
654 `DatasetType` and data ID but with different integer ID
655 exists in the collection, `ConflictingDefinitionError` is raised.
657 Parameters
658 ----------
659 collection : `str`
660 Indicates the collection the datasets should be associated with.
661 refs : `Iterable` [ `DatasetRef` ]
662 An iterable of resolved `DatasetRef` instances that already exist
663 in this `Registry`.
665 Raises
666 ------
667 ConflictingDefinitionError
668 If a Dataset with the given `DatasetRef` already exists in the
669 given collection.
670 AmbiguousDatasetError
671 Raised if ``any(ref.id is None for ref in refs)``.
672 MissingCollectionError
673 Raised if ``collection`` does not exist in the registry.
674 TypeError
675 Raised if adding new datasets to the given ``collection`` is not
676 allowed.
677 """
678 collectionRecord = self._collections.find(collection)
679 if collectionRecord.type is not CollectionType.TAGGED:
680 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
681 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
682 storage = self._datasets.find(datasetType.name)
683 assert storage is not None
684 try:
685 storage.associate(collectionRecord, refsForType)
686 except sqlalchemy.exc.IntegrityError as err:
687 raise ConflictingDefinitionError(
688 f"Constraint violation while associating dataset of type {datasetType.name} with "
689 f"collection {collection}. This probably means that one or more datasets with the same "
690 f"dataset type and data ID already exist in the collection, but it may also indicate "
691 f"that the datasets do not exist."
692 ) from err
694 @transactional
695 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
696 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
698 ``collection`` and ``ref`` combinations that are not currently
699 associated are silently ignored.
701 Parameters
702 ----------
703 collection : `str`
704 The collection the datasets should no longer be associated with.
705 refs : `Iterable` [ `DatasetRef` ]
706 An iterable of resolved `DatasetRef` instances that already exist
707 in this `Registry`.
709 Raises
710 ------
711 AmbiguousDatasetError
712 Raised if any of the given dataset references is unresolved.
713 MissingCollectionError
714 Raised if ``collection`` does not exist in the registry.
715 TypeError
716 Raised if removing datasets from the given ``collection`` is not
717 allowed.
718 """
719 collectionRecord = self._collections.find(collection)
720 if collectionRecord.type is not CollectionType.TAGGED:
721 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
722 "expected TAGGED.")
723 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
724 storage = self._datasets.find(datasetType.name)
725 assert storage is not None
726 storage.disassociate(collectionRecord, refsForType)
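# Example (illustrative sketch only): tagging and untagging existing datasets.
# ``refs`` is assumed to be an iterable of resolved `DatasetRef` instances
# (e.g. returned by `insertDatasets` or `queryDatasets`); the collection name
# is hypothetical and must already be registered as TAGGED.
#
#     registry.registerCollection("MyCam/good", CollectionType.TAGGED)
#     registry.associate("MyCam/good", refs)
#     # ...and later, to remove the tag again:
#     registry.disassociate("MyCam/good", refs)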
728 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
729 """Return an object that allows a new `Datastore` instance to
730 communicate with this `Registry`.
732 Returns
733 -------
734 manager : `DatastoreRegistryBridgeManager`
735 Object that mediates communication between this `Registry` and its
736 associated datastores.
737 """
738 return self._datastoreBridges
740 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
741 """Retrieve datastore locations for a given dataset.
743 Parameters
744 ----------
745 ref : `DatasetRef`
746 A reference to the dataset for which to retrieve storage
747 information.
749 Returns
750 -------
751 datastores : `Iterable` [ `str` ]
752 All the matching datastores holding this dataset.
754 Raises
755 ------
756 AmbiguousDatasetError
757 Raised if ``ref.id`` is `None`.
758 """
759 return self._datastoreBridges.findDatastores(ref)
761 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
762 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None,
763 **kwargs: Any) -> ExpandedDataCoordinate:
764 """Expand a dimension-based data ID to include additional information.
766 Parameters
767 ----------
768 dataId : `DataCoordinate` or `dict`, optional
769 Data ID to be expanded; augmented and overridden by ``kwargs``.
770 graph : `DimensionGraph`, optional
771 Set of dimensions for the expanded ID. If `None`, the dimensions
772 will be inferred from the keys of ``dataId`` and ``kwargs``.
773 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
774 are silently ignored, providing a way to extract and expand a
775 subset of a data ID.
776 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional
777 Dimension record data to use before querying the database for that
778 data.
779 **kwargs
780 Additional keywords are treated like additional key-value pairs for
781 ``dataId``, extending and overriding it.
783 Returns
784 -------
785 expanded : `ExpandedDataCoordinate`
786 A data ID that includes full metadata for all of the dimensions it
787 identifies.
788 """
789 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
790 if isinstance(standardized, ExpandedDataCoordinate):
791 return standardized
792 elif isinstance(dataId, ExpandedDataCoordinate):
793 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
794 records.update(dataId.records)
795 else:
796 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
797 keys = dict(standardized.byName())
798 for element in standardized.graph.primaryKeyTraversalOrder:
799 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
800 if record is ...:
801 storage = self._dimensions[element]
802 record = storage.fetch(keys)
803 records[element] = record
804 if record is not None:
805 for d in element.implied:
806 value = getattr(record, d.name)
807 if keys.setdefault(d.name, value) != value:
808 raise InconsistentDataIdError(
809 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
810 f"but {element.name} implies {d.name}={value!r}."
811 )
812 else:
813 if element in standardized.graph.required:
814 raise LookupError(
815 f"Could not fetch record for required dimension {element.name} via keys {keys}."
816 )
817 if element.alwaysJoin:
818 raise InconsistentDataIdError(
819 f"Could not fetch record for element {element.name} via keys {keys}, ",
820 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
821 "related."
822 )
823 records.update((d, None) for d in element.implied)
824 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
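# Example (illustrative sketch only): expanding a minimal data ID so that the
# full dimension records it identifies come along with it.  The key names and
# values are hypothetical and must correspond to existing dimension records.
#
#     dataId = registry.expandDataId(instrument="MyCam", exposure=123)
#     # ``dataId.records`` now maps each dimension element to its record.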
826 def insertDimensionData(self, element: Union[DimensionElement, str],
827 *data: Union[Mapping[str, Any], DimensionRecord],
828 conform: bool = True) -> None:
829 """Insert one or more dimension records into the database.
831 Parameters
832 ----------
833 element : `DimensionElement` or `str`
834 The `DimensionElement` or name thereof that identifies the table
835 records will be inserted into.
836 data : `dict` or `DimensionRecord` (variadic)
837 One or more records to insert.
838 conform : `bool`, optional
839 If `False` (`True` is default) perform no checking or conversions,
840 and assume that ``element`` is a `DimensionElement` instance and
841 ``data`` contains one or more `DimensionRecord` instances of the
842 appropriate subclass.
843 """
844 if conform:
845 if isinstance(element, str):
846 element = self.dimensions[element]
847 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
848 for row in data]
849 else:
850 # Ignore typing since caller said to trust them with conform=False.
851 records = data # type: ignore
852 storage = self._dimensions[element] # type: ignore
853 storage.insert(*records)
855 def syncDimensionData(self, element: Union[DimensionElement, str],
856 row: Union[Mapping[str, Any], DimensionRecord],
857 conform: bool = True) -> bool:
858 """Synchronize the given dimension record with the database, inserting
859 if it does not already exist and comparing values if it does.
861 Parameters
862 ----------
863 element : `DimensionElement` or `str`
864 The `DimensionElement` or name thereof that identifies the table
865 records will be inserted into.
866 row : `dict` or `DimensionRecord`
867 The record to insert.
868 conform : `bool`, optional
869 If `False` (`True` is default) perform no checking or conversions,
870 and assume that ``element`` is a `DimensionElement` instance and
871 ``row`` is a `DimensionRecord` instance of the
872 appropriate subclass.
874 Returns
875 -------
876 inserted : `bool`
877 `True` if a new row was inserted, `False` otherwise.
879 Raises
880 ------
881 ConflictingDefinitionError
882 Raised if the record exists in the database (according to primary
883 key lookup) but is inconsistent with the given one.
885 Notes
886 -----
887 This method cannot be called within transactions, as it needs to be
888 able to perform its own transaction to be concurrent.
889 """
890 if conform:
891 if isinstance(element, str):
892 element = self.dimensions[element]
893 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
894 else:
895 # Ignore typing since caller said to trust them with conform=False.
896 record = row # type: ignore
897 storage = self._dimensions[element] # type: ignore
898 return storage.sync(record)
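# Example (illustrative sketch only): inserting and then idempotently syncing
# a dimension record.  The "instrument" element comes from the default
# dimension universe; the field values are hypothetical, and the full set of
# required fields depends on the universe configuration.
#
#     registry.insertDimensionData("instrument", {"name": "MyCam"})
#     inserted = registry.syncDimensionData("instrument", {"name": "MyCam"})
#     assert not inserted   # already present with consistent values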
900 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
901 ) -> Iterator[DatasetType]:
902 """Iterate over the dataset types whose names match an expression.
904 Parameters
905 ----------
906 expression : `Any`, optional
907 An expression that fully or partially identifies the dataset types
908 to return, such as a `str`, `re.Pattern`, or iterable thereof.
909 `...` can be used to return all dataset types, and is the default.
910 See :ref:`daf_butler_dataset_type_expressions` for more
911 information.
912 components : `bool`, optional
913 If `True`, apply all expression patterns to component dataset type
914 names as well. If `False`, never apply patterns to components.
915 If `None` (default), apply patterns to components only if their
916 parent datasets were not matched by the expression.
917 Fully-specified component datasets (`str` or `DatasetType`
918 instances) are always included.
920 Yields
921 ------
922 datasetType : `DatasetType`
923 A `DatasetType` instance whose name matches ``expression``.
924 """
925 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
926 if wildcard is Ellipsis:
927 for datasetType in self._datasets:
928 if components or not datasetType.isComponent():
929 yield datasetType
930 return
931 done: Set[str] = set()
932 for name in wildcard.strings:
933 storage = self._datasets.find(name)
934 if storage is not None:
935 done.add(storage.datasetType.name)
936 yield storage.datasetType
937 if wildcard.patterns:
938 # If components (the argument) is None, we'll save component
939 # datasets that we might want to match, but only if their parents
940 # didn't get included.
941 componentsForLater = []
942 for datasetType in self._datasets:
943 if datasetType.name in done:
944 continue
945 parentName, componentName = datasetType.nameAndComponent()
946 if componentName is not None and not components:
947 if components is None and parentName not in done:
948 componentsForLater.append(datasetType)
949 continue
950 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
951 done.add(datasetType.name)
952 yield datasetType
953 # Go back and try to match saved components.
954 for datasetType in componentsForLater:
955 parentName, _ = datasetType.nameAndComponent()
956 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
957 yield datasetType
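# Example (illustrative sketch only): listing registered dataset types whose
# names match a regular expression, skipping components.  The pattern is
# hypothetical.
#
#     import re
#     for datasetType in registry.queryDatasetTypes(re.compile(r"raw.*"),
#                                                   components=False):
#         print(datasetType.name)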
959 def queryCollections(self, expression: Any = ...,
960 datasetType: Optional[DatasetType] = None,
961 collectionType: Optional[CollectionType] = None,
962 flattenChains: bool = False,
963 includeChains: Optional[bool] = None) -> Iterator[str]:
964 """Iterate over the collections whose names match an expression.
966 Parameters
967 ----------
968 expression : `Any`, optional
969 An expression that fully or partially identifies the collections
970 to return, such as a `str`, `re.Pattern`, or iterable thereof.
971 `...` can be used to return all collections, and is the default.
972 See :ref:`daf_butler_collection_expressions` for more
973 information.
974 datasetType : `DatasetType`, optional
975 If provided, only yield collections that should be searched for
976 this dataset type according to ``expression``. If this is
977 not provided, any dataset type restrictions in ``expression`` are
978 ignored.
979 collectionType : `CollectionType`, optional
980 If provided, only yield collections of this type.
981 flattenChains : `bool`, optional
982 If `True` (`False` is default), recursively yield the child
983 collections of matching `~CollectionType.CHAINED` collections.
984 includeChains : `bool`, optional
985 If `True`, yield records for matching `~CollectionType.CHAINED`
986 collections. Default is the opposite of ``flattenChains``: include
987 either CHAINED collections or their children, but not both.
989 Yields
990 ------
991 collection : `str`
992 The name of a collection that matches ``expression``.
993 """
994 query = CollectionQuery.fromExpression(expression)
995 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
996 flattenChains=flattenChains, includeChains=includeChains):
997 yield record.name
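# Example (illustrative sketch only): listing all RUN collections, flattening
# any matching CHAINED collections into their children.
#
#     for name in registry.queryCollections(collectionType=CollectionType.RUN,
#                                           flattenChains=True):
#         print(name)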
999 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1000 """Return a `QueryBuilder` instance capable of constructing and
1001 managing more complex queries than those obtainable via `Registry`
1002 interfaces.
1004 This is an advanced interface; downstream code should prefer
1005 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1006 are sufficient.
1008 Parameters
1009 ----------
1010 summary : `QuerySummary`
1011 Object describing and categorizing the full set of dimensions that
1012 will be included in the query.
1014 Returns
1015 -------
1016 builder : `QueryBuilder`
1017 Object that can be used to construct and perform advanced queries.
1018 """
1019 return QueryBuilder(summary=summary,
1020 collections=self._collections,
1021 dimensions=self._dimensions,
1022 datasets=self._datasets)
1024 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1025 dataId: Optional[DataId] = None,
1026 datasets: Any = None,
1027 collections: Any = None,
1028 where: Optional[str] = None,
1029 expand: bool = True,
1030 components: Optional[bool] = None,
1031 **kwargs: Any) -> Iterator[DataCoordinate]:
1032 """Query for and iterate over data IDs matching user-provided criteria.
1034 Parameters
1035 ----------
1036 dimensions : `Dimension` or `str`, or iterable thereof
1037 The dimensions of the data IDs to yield, as either `Dimension`
1038 instances or `str`. Will be automatically expanded to a complete
1039 `DimensionGraph`.
1040 dataId : `dict` or `DataCoordinate`, optional
1041 A data ID whose key-value pairs are used as equality constraints
1042 in the query.
1043 datasets : `Any`, optional
1044 An expression that fully or partially identifies dataset types
1045 that should constrain the yielded data IDs. For example, including
1046 "raw" here would constrain the yielded ``instrument``,
1047 ``exposure``, ``detector``, and ``physical_filter`` values to only
1048 those for which at least one "raw" dataset exists in
1049 ``collections``. Allowed types include `DatasetType`, `str`,
1050 `re.Pattern`, and iterables thereof. Unlike other dataset type
1051 expressions, `...` is not permitted - it doesn't make sense to
1052 constrain data IDs on the existence of *all* datasets.
1053 See :ref:`daf_butler_dataset_type_expressions` for more
1054 information.
1055 collections : `Any`, optional
1056 An expression that fully or partially identifies the collections
1057 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1058 thereof. `...` can be used to search all collections. Must be
1059 provided if ``datasets`` is, and is ignored if it is not. See
1060 :ref:`daf_butler_collection_expressions` for more information.
1061 where : `str`, optional
1062 A string expression similar to a SQL WHERE clause. May involve
1063 any column of a dimension table or (as a shortcut for the primary
1064 key column of a dimension table) dimension name. See
1065 :ref:`daf_butler_dimension_expressions` for more information.
1066 expand : `bool`, optional
1067 If `True` (default) yield `ExpandedDataCoordinate` instead of
1068 minimal `DataCoordinate` base-class instances.
1069 components : `bool`, optional
1070 If `True`, apply all dataset expression patterns to component
1071 dataset type names as well. If `False`, never apply patterns to
1072 components. If `None` (default), apply patterns to components only
1073 if their parent datasets were not matched by the expression.
1074 Fully-specified component datasets (`str` or `DatasetType`
1075 instances) are always included.
1076 **kwargs
1077 Additional keyword arguments are forwarded to
1078 `DataCoordinate.standardize` when processing the ``dataId``
1079 argument (and may be used to provide a constraining data ID even
1080 when the ``dataId`` argument is `None`).
1082 Yields
1083 ------
1084 dataId : `DataCoordinate`
1085 Data IDs matching the given query parameters. Order is
1086 unspecified.
1087 """
1088 dimensions = iterable(dimensions)
1089 standardizedDataId = self.expandDataId(dataId, **kwargs)
1090 standardizedDatasetTypes = set()
1091 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1092 if datasets is not None:
1093 if collections is None:
1094 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1095 for datasetType in self.queryDatasetTypes(datasets, components=components):
1096 requestedDimensionNames.update(datasetType.dimensions.names)
1097 # If any matched dataset type is a component, just operate on
1098 # its parent instead, because Registry doesn't know anything
1099 # about what components exist, and here (unlike queryDatasets)
1100 # we don't care about returning them.
1101 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1102 if componentName is not None:
1103 datasetType = self.getDatasetType(parentDatasetTypeName)
1104 standardizedDatasetTypes.add(datasetType)
1105 # Preprocess collections expression in case the original included
1106 # single-pass iterators (we'll want to use it multiple times
1107 # below).
1108 collections = CollectionQuery.fromExpression(collections)
1110 summary = QuerySummary(
1111 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1112 dataId=standardizedDataId,
1113 expression=where,
1114 )
1115 builder = self.makeQueryBuilder(summary)
1116 for datasetType in standardizedDatasetTypes:
1117 builder.joinDataset(datasetType, collections, isResult=False)
1118 query = builder.finish()
1119 predicate = query.predicate()
1120 for row in self._db.query(query.sql):
1121 if predicate(row):
1122 result = query.extractDataId(row)
1123 if expand:
1124 yield self.expandDataId(result, records=standardizedDataId.records)
1125 else:
1126 yield result
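# Example (illustrative sketch only): finding the exposure/detector data IDs
# for which a "raw" dataset exists in a collection, further restricted by a
# ``where`` expression.  The dataset type, collection, dimension names, and
# expression are hypothetical.
#
#     dataIds = registry.queryDimensions(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="MyCam/raw/run1",
#         where="instrument = 'MyCam' AND exposure > 100",
#     )
#     for dataId in dataIds:
#         print(dataId)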
1128 def queryDatasets(self, datasetType: Any, *,
1129 collections: Any,
1130 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1131 dataId: Optional[DataId] = None,
1132 where: Optional[str] = None,
1133 deduplicate: bool = False,
1134 expand: bool = True,
1135 components: Optional[bool] = None,
1136 **kwargs: Any) -> Iterator[DatasetRef]:
1137 """Query for and iterate over dataset references matching user-provided
1138 criteria.
1140 Parameters
1141 ----------
1142 datasetType
1143 An expression that fully or partially identifies the dataset types
1144 to be queried. Allowed types include `DatasetType`, `str`,
1145 `re.Pattern`, and iterables thereof. The special value `...` can
1146 be used to query all dataset types. See
1147 :ref:`daf_butler_dataset_type_expressions` for more information.
1148 collections
1149 An expression that fully or partially identifies the collections
1150 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1151 thereof. `...` can be used to search all collections. See
1152 :ref:`daf_butler_collection_expressions` for more information.
1153 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1154 Dimensions to include in the query (in addition to those used
1155 to identify the queried dataset type(s)), either to constrain
1156 the resulting datasets to those for which a matching dimension
1157 exists, or to relate the dataset type's dimensions to dimensions
1158 referenced by the ``dataId`` or ``where`` arguments.
1159 dataId : `dict` or `DataCoordinate`, optional
1160 A data ID whose key-value pairs are used as equality constraints
1161 in the query.
1162 where : `str`, optional
1163 A string expression similar to a SQL WHERE clause. May involve
1164 any column of a dimension table or (as a shortcut for the primary
1165 key column of a dimension table) dimension name. See
1166 :ref:`daf_butler_dimension_expressions` for more information.
1167 deduplicate : `bool`, optional
1168 If `True` (`False` is default), for each result data ID, only
1169 yield one `DatasetRef` of each `DatasetType`, from the first
1170 collection in which a dataset of that dataset type appears
1171 (according to the order of ``collections`` passed in). If `True`,
1172 ``collections`` must not contain regular expressions and may not
1173 be `...`.
1174 expand : `bool`, optional
1175 If `True` (default) attach `ExpandedDataCoordinate` instead of
1176 minimal `DataCoordinate` base-class instances.
1177 components : `bool`, optional
1178 If `True`, apply all dataset expression patterns to component
1179 dataset type names as well. If `False`, never apply patterns to
1180 components. If `None` (default), apply patterns to components only
1181 if their parent datasets were not matched by the expression.
1182 Fully-specified component datasets (`str` or `DatasetType`
1183 instances) are always included.
1184 **kwargs
1185 Additional keyword arguments are forwarded to
1186 `DataCoordinate.standardize` when processing the ``dataId``
1187 argument (and may be used to provide a constraining data ID even
1188 when the ``dataId`` argument is `None`).
1190 Yields
1191 ------
1192 ref : `DatasetRef`
1193 Dataset references matching the given query criteria. These
1194 are grouped by `DatasetType` if the query evaluates to multiple
1195 dataset types, but order is otherwise unspecified.
1197 Raises
1198 ------
1199 TypeError
1200 Raised when the arguments are incompatible, such as when a
1201 collection wildcard is passed when ``deduplicate`` is `True`.
1203 Notes
1204 -----
1205 When multiple dataset types are queried in a single call, the
1206 results of this operation are equivalent to querying for each dataset
1207 type separately in turn, and no information about the relationships
1208 between datasets of different types is included. In contexts where
1209 that kind of information is important, the recommended pattern is to
1210 use `queryDimensions` to first obtain data IDs (possibly with the
1211 desired dataset types and collections passed as constraints to the
1212 query), and then use multiple (generally much simpler) calls to
1213 `queryDatasets` with the returned data IDs passed as constraints.
1214 """
1215 # Standardize the collections expression.
1216 if deduplicate:
1217 collections = CollectionSearch.fromExpression(collections)
1218 else:
1219 collections = CollectionQuery.fromExpression(collections)
1220 # Standardize and expand the data ID provided as a constraint.
1221 standardizedDataId = self.expandDataId(dataId, **kwargs)
1223 # We can only query directly if given a non-component DatasetType
1224 # instance. If we were given an expression or str or a component
1225 # DatasetType instance, we'll populate this dict, recurse, and return.
1226 # If we already have a non-component DatasetType, it will remain None
1227 # and we'll run the query directly.
1228 composition: Optional[
1229 Dict[
1230 DatasetType, # parent dataset type
1231 List[Optional[str]] # component name, or None for parent
1232 ]
1233 ] = None
1234 if not isinstance(datasetType, DatasetType):
1235 # We were given a dataset type expression (which may be as simple
1236 # as a str). Loop over all matching datasets, delegating handling
1237 # of the `components` argument to queryDatasetTypes, as we populate
1238 # the composition dict.
1239 composition = defaultdict(list)
1240 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1241 parentName, componentName = trueDatasetType.nameAndComponent()
1242 if componentName is not None:
1243 parentDatasetType = self.getDatasetType(parentName)
1244 composition.setdefault(parentDatasetType, []).append(componentName)
1245 else:
1246 composition.setdefault(trueDatasetType, []).append(None)
1247 elif datasetType.isComponent():
1248 # We were given a true DatasetType instance, but it's a component.
1249 # the composition dict will have exactly one item.
1250 parentName, componentName = datasetType.nameAndComponent()
1251 parentDatasetType = self.getDatasetType(parentName)
1252 composition = {parentDatasetType: [componentName]}
1253 if composition is not None:
1254 # We need to recurse. Do that once for each parent dataset type.
1255 for parentDatasetType, componentNames in composition.items():
1256 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1257 dimensions=dimensions, dataId=standardizedDataId,
1258 where=where, deduplicate=deduplicate):
1259 # Loop over components, yielding one ref for each component
1260 # requested.
1261 for componentName in componentNames:
1262 if componentName is None:
1263 yield parentRef
1264 else:
1265 yield parentRef.makeComponentRef(componentName)
1266 return
1267 # If we get here, there's no need to recurse (or we are already
1268 # recursing; there can only ever be one level of recursion).
1270 # The full set of dimensions in the query is the combination of those
1271 # needed for the DatasetType and those explicitly requested, if any.
1272 requestedDimensionNames = set(datasetType.dimensions.names)
1273 if dimensions is not None:
1274 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1275 # Construct the summary structure needed to construct a QueryBuilder.
1276 summary = QuerySummary(
1277 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1278 dataId=standardizedDataId,
1279 expression=where,
1280 )
1281 builder = self.makeQueryBuilder(summary)
1282 # Add the dataset subquery to the query, telling the QueryBuilder to
1283 # include the rank of the selected collection in the results only if we
1284 # need to deduplicate. Note that if any of the collections are
1285 # actually wildcard expressions, and we've asked for deduplication,
1286 # this will raise TypeError for us.
1287 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1288 return
1289 query = builder.finish()
1290 predicate = query.predicate()
1291 if not deduplicate:
1292 # No need to de-duplicate across collections.
1293 for row in self._db.query(query.sql):
1294 if predicate(row):
1295 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1296 if expand:
1297 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1298 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1299 else:
1300 # For each data ID, yield only the DatasetRef with the lowest
1301 # collection rank.
1302 bestRefs = {}
1303 bestRanks: Dict[DataCoordinate, int] = {}
1304 for row in self._db.query(query.sql):
1305 if predicate(row):
1306 ref, rank = query.extractDatasetRef(row, datasetType)
1307 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1308 assert rank is not None
1309 if rank < bestRank:
1310 bestRefs[ref.dataId] = ref
1311 bestRanks[ref.dataId] = rank
1312 # If caller requested expanded data IDs, we defer that until here
1313 # so we do as little expansion as possible.
1314 if expand:
1315 for ref in bestRefs.values():
1316 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1317 yield ref.expanded(dataId)
1318 else:
1319 yield from bestRefs.values()
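# Example (illustrative sketch only): querying datasets across an ordered
# list of collections, keeping only the first match for each data ID.  The
# dataset type, collection names, and ``where`` expression are hypothetical.
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["MyCam/runA", "MyCam/runB"],
#         where="detector = 42",
#         deduplicate=True,
#     )
#     for ref in refs:
#         print(ref.datasetType.name, ref.dataId, ref.run)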
1321 storageClasses: StorageClassFactory
1322 """All storage classes known to the registry (`StorageClassFactory`).
1323 """