Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from lsst.utils import doImport
48from ..core import (
49 ButlerURI,
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetRef,
56 DatasetType,
57 ddl,
58 Dimension,
59 DimensionConfig,
60 DimensionElement,
61 DimensionGraph,
62 DimensionRecord,
63 DimensionUniverse,
64 NamedKeyMapping,
65 NameLookupMapping,
66 StorageClassFactory,
67 Timespan,
68)
69from . import queries
70from ..core.utils import iterable, transactional
71from ._config import RegistryConfig
72from ._collectionType import CollectionType
73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
75from .interfaces import ChainedCollectionRecord, RunRecord
76from .versions import ButlerVersionsManager, DigestMismatchError
78if TYPE_CHECKING:
79 from .._butlerConfig import ButlerConfig
80 from .interfaces import (
81 ButlerAttributeManager,
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 DatastoreRegistryBridgeManager,
88 )
91_LOG = logging.getLogger(__name__)
93# key for dimensions configuration in attributes table
94_DIMENSIONS_ATTR = "config:dimensions.json"
97class Registry:
98 """Registry interface.
100 Parameters
101 ----------
102 database : `Database`
103 Database instance to store Registry.
104 attributes : `type`
105 Manager class implementing `ButlerAttributeManager`.
106 opaque : `type`
107 Manager class implementing `OpaqueTableStorageManager`.
108 dimensions : `type`
109 Manager class implementing `DimensionRecordStorageManager`.
110 collections : `type`
111 Manager class implementing `CollectionManager`.
112 datasets : `type`
113 Manager class implementing `DatasetRecordStorageManager`.
114 datastoreBridges : `type`
115 Manager class implementing `DatastoreRegistryBridgeManager`.
116 dimensionConfig : `DimensionConfig`, optional
117 Dimension universe configuration, only used when ``create`` is True.
118 writeable : `bool`, optional
119 If True then Registry will support write operations.
120 create : `bool`, optional
121 If True then the database schema will be initialized; the database
122 must be empty before instantiating Registry.
123 """
125 defaultConfigFile: Optional[str] = None
126 """Path to configuration defaults. Accessed within the ``configs`` resource
127 or relative to a search path. Can be None if no defaults specified.
128 """
130 @classmethod
131 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
132 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
133 butlerRoot: Optional[str] = None) -> Registry:
134 """Create registry database and return `Registry` instance.
136 This method initializes database contents; the database must be
137 empty prior to calling this method.
139 Parameters
140 ----------
141 config : `RegistryConfig` or `str`, optional
142 Registry configuration, if missing then default configuration will
143 be loaded from registry.yaml.
144 dimensionConfig : `DimensionConfig` or `str`, optional
145 Dimensions configuration, if missing then default configuration
146 will be loaded from dimensions.yaml.
147 butlerRoot : `str`, optional
148 Path to the repository root this `Registry` will manage.
150 Returns
151 -------
152 registry : `Registry`
153 A new `Registry` instance.
154 """
155 if isinstance(config, str):
156 config = RegistryConfig(config)
157 elif config is None:
158 config = RegistryConfig()
159 elif not isinstance(config, RegistryConfig):
160 raise TypeError(f"Incompatible Registry configuration type: {type(config)}")
161 config.replaceRoot(butlerRoot)
163 if isinstance(dimensionConfig, str):
164 dimensionConfig = DimensionConfig(dimensionConfig)
165 elif dimensionConfig is None:
166 dimensionConfig = DimensionConfig()
167 elif not isinstance(dimensionConfig, DimensionConfig):
168 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
170 DatabaseClass = config.getDatabaseClass()
171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
172 namespace=config.get("namespace"))
173 attributes = doImport(config["managers", "attributes"])
174 opaque = doImport(config["managers", "opaque"])
175 dimensions = doImport(config["managers", "dimensions"])
176 collections = doImport(config["managers", "collections"])
177 datasets = doImport(config["managers", "datasets"])
178 datastoreBridges = doImport(config["managers", "datastores"])
180 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
181 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
182 dimensionConfig=dimensionConfig, create=True)
184 @classmethod
185 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
186 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True) -> Registry:
187 """Create `Registry` subclass instance from `config`.
189 Registry database must be initialized prior to calling this method.
191 Parameters
192 ----------
193 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
194 Registry configuration
195 butlerRoot : `str` or `ButlerURI`, optional
196 Path to the repository root this `Registry` will manage.
197 writeable : `bool`, optional
198 If `True` (default) create a read-write connection to the database.
200 Returns
201 -------
202 registry : `Registry` (subclass)
203 A new `Registry` subclass instance.
204 """
205 if not isinstance(config, RegistryConfig):
206 if isinstance(config, str) or isinstance(config, Config):
207 config = RegistryConfig(config)
208 else:
209 raise ValueError("Incompatible Registry configuration: {}".format(config))
210 config.replaceRoot(butlerRoot)
211 DatabaseClass = config.getDatabaseClass()
212 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
213 namespace=config.get("namespace"), writeable=writeable)
214 attributes = doImport(config["managers", "attributes"])
215 opaque = doImport(config["managers", "opaque"])
216 dimensions = doImport(config["managers", "dimensions"])
217 collections = doImport(config["managers", "collections"])
218 datasets = doImport(config["managers", "datasets"])
219 datastoreBridges = doImport(config["managers", "datastores"])
221 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
222 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
223 dimensionConfig=None, writeable=writeable)
225 def __init__(self, database: Database, *,
226 attributes: Type[ButlerAttributeManager],
227 opaque: Type[OpaqueTableStorageManager],
228 dimensions: Type[DimensionRecordStorageManager],
229 collections: Type[CollectionManager],
230 datasets: Type[DatasetRecordStorageManager],
231 datastoreBridges: Type[DatastoreRegistryBridgeManager],
232 dimensionConfig: Optional[DimensionConfig] = None,
233 writeable: bool = True,
234 create: bool = False):
235 self._db = database
236 self.storageClasses = StorageClassFactory()
238 # With existing registry we have to read dimensions config from
239 # database before we initialize all other managers.
240 if dimensionConfig is None:
241 assert not create, "missing DimensionConfig when create=True"
242 with self._db.declareStaticTables(create=False) as context:
243 self._attributes = attributes.initialize(self._db, context)
245 versions = ButlerVersionsManager(
246 self._attributes,
247 dict(attributes=self._attributes)
248 )
249 # verify that configured versions are compatible with schema
250 versions.checkManagersConfig()
251 versions.checkManagersVersions(writeable)
253 # get serialized as a string from database
254 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR)
255 if dimensionsString is not None:
256 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json"))
257 else:
258 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
260 # make universe
261 universe = DimensionUniverse(dimensionConfig)
263 with self._db.declareStaticTables(create=create) as context:
264 self._attributes = attributes.initialize(self._db, context)
265 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
266 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions)
267 self._datasets = datasets.initialize(self._db, context,
268 collections=self._collections,
269 dimensions=self._dimensions)
270 self._opaque = opaque.initialize(self._db, context)
271 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
272 opaque=self._opaque,
273 datasets=datasets,
274 universe=self._dimensions.universe)
275 versions = ButlerVersionsManager(
276 self._attributes,
277 dict(
278 attributes=self._attributes,
279 opaque=self._opaque,
280 dimensions=self._dimensions,
281 collections=self._collections,
282 datasets=self._datasets,
283 datastores=self._datastoreBridges,
284 )
285 )
286 # store managers and their versions in attributes table
287 context.addInitializer(lambda db: versions.storeManagersConfig())
288 context.addInitializer(lambda db: versions.storeManagersVersions())
289 # dump universe config as json into attributes (faster than YAML)
290 json = dimensionConfig.dump(format="json")
291 if json is not None:
292 # Convert Optional[str] to str for mypy
293 json_str = json
294 context.addInitializer(
295 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str)
296 )
297 else:
298 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
300 if not create:
301 # verify that configured versions are compatible with schema
302 versions.checkManagersConfig()
303 versions.checkManagersVersions(writeable)
304 try:
305 versions.checkManagersDigests()
306 except DigestMismatchError as exc:
307 # potentially digest mismatch is a serious error but during
308 # development it could be benign, treat this as warning for
309 # now.
310 _LOG.warning(f"Registry schema digest mismatch: {exc}")
312 self._dimensions.refresh()
313 self._collections.refresh()
314 self._datasets.refresh()
316 def __str__(self) -> str:
317 return str(self._db)
319 def __repr__(self) -> str:
320 return f"Registry({self._db!r}, {self.dimensions!r})"
322 def isWriteable(self) -> bool:
323 """Return `True` if this registry allows write operations, and `False`
324 otherwise.
325 """
326 return self._db.isWriteable()
328 @property
329 def dimensions(self) -> DimensionUniverse:
330 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
331 """
332 return self._dimensions.universe
334 @contextlib.contextmanager
335 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
336 """Return a context manager that represents a transaction.
337 """
338 try:
339 with self._db.transaction(savepoint=savepoint):
340 yield
341 except BaseException:
342 # TODO: this clears the caches sometimes when we wouldn't actually
343 # need to. Can we avoid that?
344 self._dimensions.clearCaches()
345 raise
347 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
348 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
349 other data repository client.
351 Opaque table records can be added via `insertOpaqueData`, retrieved via
352 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
354 Parameters
355 ----------
356 tableName : `str`
357 Logical name of the opaque table. This may differ from the
358 actual name used in the database by a prefix and/or suffix.
359 spec : `ddl.TableSpec`
360 Specification for the table to be added.
361 """
362 self._opaque.register(tableName, spec)
364 @transactional
365 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
366 """Insert records into an opaque table.
368 Parameters
369 ----------
370 tableName : `str`
371 Logical name of the opaque table. Must match the name used in a
372 previous call to `registerOpaqueTable`.
373 data
374 Each additional positional argument is a dictionary that represents
375 a single row to be added.
376 """
377 self._opaque[tableName].insert(*data)
379 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
380 """Retrieve records from an opaque table.
382 Parameters
383 ----------
384 tableName : `str`
385 Logical name of the opaque table. Must match the name used in a
386 previous call to `registerOpaqueTable`.
387 where
388 Additional keyword arguments are interpreted as equality
389 constraints that restrict the returned rows (combined with AND);
390 keyword arguments are column names and values are the values they
391 must have.
393 Yields
394 ------
395 row : `dict`
396 A dictionary representing a single result row.
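
 Examples
 --------
 A minimal sketch of the opaque-table round trip; the table name,
 column name, and ``spec`` (a `ddl.TableSpec` built by the caller) are
 illustrative only::

     registry.registerOpaqueTable("datastore_records", spec)
     registry.insertOpaqueData("datastore_records", {"path": "a.fits"})
     rows = list(registry.fetchOpaqueData("datastore_records", path="a.fits"))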
397 """
398 yield from self._opaque[tableName].fetch(**where)
400 @transactional
401 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
402 """Remove records from an opaque table.
404 Parameters
405 ----------
406 tableName : `str`
407 Logical name of the opaque table. Must match the name used in a
408 previous call to `registerOpaqueTable`.
409 where
410 Additional keyword arguments are interpreted as equality
411 constraints that restrict the deleted rows (combined with AND);
412 keyword arguments are column names and values are the values they
413 must have.
414 """
415 self._opaque[tableName].delete(**where)
417 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
418 doc: Optional[str] = None) -> None:
419 """Add a new collection if one with the given name does not exist.
421 Parameters
422 ----------
423 name : `str`
424 The name of the collection to create.
425 type : `CollectionType`
426 Enum value indicating the type of collection to create.
427 doc : `str`, optional
428 Documentation string for the collection.
430 Notes
431 -----
432 This method cannot be called within transactions, as it needs to be
433 able to perform its own transaction to be concurrent.
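
 Examples
 --------
 A short sketch; the collection names are illustrative only::

     registry.registerCollection("my/tagged", CollectionType.TAGGED,
                                 doc="Hand-picked datasets.")
     registry.registerRun("my/run")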
434 """
435 self._collections.register(name, type, doc=doc)
437 def getCollectionType(self, name: str) -> CollectionType:
438 """Return an enumeration value indicating the type of the given
439 collection.
441 Parameters
442 ----------
443 name : `str`
444 The name of the collection.
446 Returns
447 -------
448 type : `CollectionType`
449 Enum value indicating the type of this collection.
451 Raises
452 ------
453 MissingCollectionError
454 Raised if no collection with the given name exists.
455 """
456 return self._collections.find(name).type
458 def registerRun(self, name: str, doc: Optional[str] = None) -> None:
459 """Add a new run if one with the given name does not exist.
461 Parameters
462 ----------
463 name : `str`
464 The name of the run to create.
465 doc : `str`, optional
466 Documentation string for the collection.
468 Notes
469 -----
470 This method cannot be called within transactions, as it needs to be
471 able to perform its own transaction to be concurrent.
472 """
473 self._collections.register(name, CollectionType.RUN, doc=doc)
475 @transactional
476 def removeCollection(self, name: str) -> None:
477 """Completely remove the given collection.
479 Parameters
480 ----------
481 name : `str`
482 The name of the collection to remove.
484 Raises
485 ------
486 MissingCollectionError
487 Raised if no collection with the given name exists.
489 Notes
490 -----
491 If this is a `~CollectionType.RUN` collection, all datasets and quanta
492 in it are also fully removed. This requires that those datasets be
493 removed (or at least trashed) from any datastores that hold them first.
495 A collection may not be deleted as long as it is referenced by a
496 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
497 be deleted or redefined first.
498 """
499 self._collections.remove(name)
501 def getCollectionChain(self, parent: str) -> CollectionSearch:
502 """Return the child collections in a `~CollectionType.CHAINED`
503 collection.
505 Parameters
506 ----------
507 parent : `str`
508 Name of the chained collection. Must have already been added via
509 a call to `Registry.registerCollection`.
511 Returns
512 -------
513 children : `CollectionSearch`
514 An object that defines the search path of the collection.
515 See :ref:`daf_butler_collection_expressions` for more information.
517 Raises
518 ------
519 MissingCollectionError
520 Raised if ``parent`` does not exist in the `Registry`.
521 TypeError
522 Raised if ``parent`` does not correspond to a
523 `~CollectionType.CHAINED` collection.
524 """
525 record = self._collections.find(parent)
526 if record.type is not CollectionType.CHAINED:
527 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
528 assert isinstance(record, ChainedCollectionRecord)
529 return record.children
531 @transactional
532 def setCollectionChain(self, parent: str, children: Any) -> None:
533 """Define or redefine a `~CollectionType.CHAINED` collection.
535 Parameters
536 ----------
537 parent : `str`
538 Name of the chained collection. Must have already been added via
539 a call to `Registry.registerCollection`.
540 children : `Any`
541 An expression defining an ordered search of child collections,
542 generally an iterable of `str`; see
543 :ref:`daf_butler_collection_expressions` for more information.
545 Raises
546 ------
547 MissingCollectionError
548 Raised when any of the given collections do not exist in the
549 `Registry`.
550 TypeError
551 Raised if ``parent`` does not correspond to a
552 `~CollectionType.CHAINED` collection.
553 ValueError
554 Raised if the given collections contain a cycle.
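
 Examples
 --------
 A sketch that chains two previously registered collections; all names
 are illustrative only::

     registry.registerCollection("my/chain", CollectionType.CHAINED)
     registry.setCollectionChain("my/chain", ["my/run", "my/tagged"])
     children = registry.getCollectionChain("my/chain")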
555 """
556 record = self._collections.find(parent)
557 if record.type is not CollectionType.CHAINED:
558 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
559 assert isinstance(record, ChainedCollectionRecord)
560 children = CollectionSearch.fromExpression(children)
561 if children != record.children:
562 record.update(self._collections, children)
564 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
565 """Retrieve the documentation string for a collection.
567 Parameters
568 ----------
569 collection : `str`
570 Name of the collection.
572 Returns
573 -------
574 docs : `str` or `None`
575 Docstring for the collection with the given name.
576 """
577 return self._collections.getDocumentation(self._collections.find(collection).key)
579 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
580 """Set the documentation string for a collection.
582 Parameters
583 ----------
584 collection : `str`
585 Name of the collection.
586 doc : `str` or `None`
587 Docstring for the collection with the given name; will replace any
588 existing docstring. Passing `None` will remove any existing
589 docstring.
590 """
591 self._collections.setDocumentation(self._collections.find(collection).key, doc)
593 def registerDatasetType(self, datasetType: DatasetType) -> bool:
594 """
595 Add a new `DatasetType` to the Registry.
597 It is not an error to register the same `DatasetType` twice.
599 Parameters
600 ----------
601 datasetType : `DatasetType`
602 The `DatasetType` to be added.
604 Returns
605 -------
606 inserted : `bool`
607 `True` if ``datasetType`` was inserted, `False` if an identical
608 existing `DatasetType` was found. Note that in either case the
609 DatasetType is guaranteed to be defined in the Registry
610 consistently with the given definition.
612 Raises
613 ------
614 ValueError
615 Raised if the dimensions or storage class are invalid.
616 ConflictingDefinitionError
617 Raised if this DatasetType is already registered with a different
618 definition.
620 Notes
621 -----
622 This method cannot be called within transactions, as it needs to be
623 able to perform its own transaction to be concurrent.
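
 Examples
 --------
 A sketch, assuming the usual `DatasetType` constructor that accepts
 dimension names together with a ``universe``; the dataset type name,
 dimensions, and storage class are illustrative only::

     datasetType = DatasetType("my_calexp",
                               dimensions=("instrument", "visit", "detector"),
                               storageClass="ExposureF",
                               universe=registry.dimensions)
     registry.registerDatasetType(datasetType)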
624 """
625 _, inserted = self._datasets.register(datasetType)
626 return inserted
628 def removeDatasetType(self, name: str) -> None:
629 """Remove the named `DatasetType` from the registry.
631 .. warning::
633 Registry caches the dataset type definitions. This means that
634 deleting the dataset type definition may result in unexpected
635 behavior from other butler processes that are active that have
636 not seen the deletion.
638 Parameters
639 ----------
640 name : `str`
641 Name of the type to be removed.
643 Raises
644 ------
645 lsst.daf.butler.registry.OrphanedRecordError
646 Raised if an attempt is made to remove the dataset type definition
647 when there are already datasets associated with it.
649 Notes
650 -----
651 If the dataset type is not registered the method will return without
652 action.
653 """
654 self._datasets.remove(name)
656 def getDatasetType(self, name: str) -> DatasetType:
657 """Get the `DatasetType`.
659 Parameters
660 ----------
661 name : `str`
662 Name of the type.
664 Returns
665 -------
666 type : `DatasetType`
667 The `DatasetType` associated with the given name.
669 Raises
670 ------
671 KeyError
672 Requested named DatasetType could not be found in registry.
673 """
674 return self._datasets[name].datasetType
676 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
677 collections: Any, timespan: Optional[Timespan] = None,
678 **kwargs: Any) -> Optional[DatasetRef]:
679 """Find a dataset given its `DatasetType` and data ID.
681 This can be used to obtain a `DatasetRef` that permits the dataset to
682 be read from a `Datastore`. If the dataset is a component and can not
683 be found using the provided dataset type, a dataset ref for the parent
684 will be returned instead but with the correct dataset type.
686 Parameters
687 ----------
688 datasetType : `DatasetType` or `str`
689 A `DatasetType` or the name of one.
690 dataId : `dict` or `DataCoordinate`, optional
691 A `dict`-like object containing the `Dimension` links that identify
692 the dataset within a collection.
693 collections
694 An expression that fully or partially identifies the collections to
695 search for the dataset; see
696 :ref:`daf_butler_collection_expressions` for more information.
697 timespan : `Timespan`, optional
698 A timespan that the validity range of the dataset must overlap.
699 If not provided, any `~CollectionType.CALIBRATION` collections
700 matched by the ``collections`` argument will not be searched.
701 **kwargs
702 Additional keyword arguments passed to
703 `DataCoordinate.standardize` to convert ``dataId`` to a true
704 `DataCoordinate` or augment an existing one.
706 Returns
707 -------
708 ref : `DatasetRef`
709 A reference to the dataset, or `None` if no matching Dataset
710 was found.
712 Raises
713 ------
714 LookupError
715 Raised if one or more data ID keys are missing.
716 KeyError
717 Raised if the dataset type does not exist.
718 MissingCollectionError
719 Raised if any of ``collections`` does not exist in the registry.
721 Notes
722 -----
723 This method simply returns `None` and does not raise an exception even
724 when the set of collections searched is intrinsically incompatible with
725 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
726 only `~CollectionType.CALIBRATION` collections are being searched.
727 This may make it harder to debug some lookup failures, but the behavior
728 is intentional; we consider it more important that failed searches are
729 reported consistently, regardless of the reason, and that adding
730 additional collections that do not contain a match to the search path
731 never changes the behavior.
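
 Examples
 --------
 A sketch of a typical lookup; the dataset type name, data ID values,
 and collection name are illustrative only::

     ref = registry.findDataset("calexp", instrument="HSC", visit=903334,
                                detector=10, collections=["my/run"])
     if ref is None:
         ...  # nothing matched in the searched collections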
732 """
733 if isinstance(datasetType, DatasetType):
734 storage = self._datasets[datasetType.name]
735 else:
736 storage = self._datasets[datasetType]
737 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
738 universe=self.dimensions, **kwargs)
739 collections = CollectionSearch.fromExpression(collections)
740 for collectionRecord in collections.iter(self._collections):
741 if (collectionRecord.type is CollectionType.CALIBRATION
742 and (not storage.datasetType.isCalibration() or timespan is None)):
743 continue
744 result = storage.find(collectionRecord, dataId, timespan=timespan)
745 if result is not None:
746 return result
748 return None
750 @transactional
751 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
752 run: str) -> List[DatasetRef]:
753 """Insert one or more datasets into the `Registry`
755 This always adds new datasets; to associate existing datasets with
756 a new collection, use ``associate``.
758 Parameters
759 ----------
760 datasetType : `DatasetType` or `str`
761 A `DatasetType` or the name of one.
762 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
763 Dimension-based identifiers for the new datasets.
764 run : `str`
765 The name of the run that produced the datasets.
767 Returns
768 -------
769 refs : `list` of `DatasetRef`
770 Resolved `DatasetRef` instances for all given data IDs (in the same
771 order).
773 Raises
774 ------
775 ConflictingDefinitionError
776 If a dataset with the same dataset type and data ID as one of those
777 given already exists in ``run``.
778 MissingCollectionError
779 Raised if ``run`` does not exist in the registry.
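
 Examples
 --------
 A sketch that inserts two datasets of an already-registered dataset
 type into an existing run; all names and data ID values are
 illustrative only::

     refs = registry.insertDatasets(
         "raw",
         dataIds=[{"instrument": "HSC", "exposure": 903334, "detector": 10},
                  {"instrument": "HSC", "exposure": 903334, "detector": 11}],
         run="my/run",
     )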
780 """
781 if isinstance(datasetType, DatasetType):
782 storage = self._datasets.find(datasetType.name)
783 if storage is None:
784 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
785 else:
786 storage = self._datasets.find(datasetType)
787 if storage is None:
788 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
789 runRecord = self._collections.find(run)
790 if runRecord.type is not CollectionType.RUN:
791 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
792 assert isinstance(runRecord, RunRecord)
793 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
794 for dataId in dataIds]
795 try:
796 refs = list(storage.insert(runRecord, expandedDataIds))
797 except sqlalchemy.exc.IntegrityError as err:
798 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
799 f"one or more datasets of type {storage.datasetType} into "
800 f"collection '{run}'. "
801 f"This probably means a dataset with the same data ID "
802 f"and dataset type already exists, but it may also mean a "
803 f"dimension row is missing.") from err
804 return refs
806 def getDataset(self, id: int) -> Optional[DatasetRef]:
807 """Retrieve a Dataset entry.
809 Parameters
810 ----------
811 id : `int`
812 The unique identifier for the dataset.
814 Returns
815 -------
816 ref : `DatasetRef` or `None`
817 A ref to the Dataset, or `None` if no matching Dataset
818 was found.
819 """
820 ref = self._datasets.getDatasetRef(id)
821 if ref is None:
822 return None
823 return ref
825 @transactional
826 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
827 """Remove datasets from the Registry.
829 The datasets will be removed unconditionally from all collections, and
830 any `Quantum` that consumed this dataset will instead be marked as
831 having a NULL input. `Datastore` records will *not* be deleted; the
832 caller is responsible for ensuring that the dataset has already been
833 removed from all Datastores.
835 Parameters
836 ----------
837 refs : `Iterable` of `DatasetRef`
838 References to the datasets to be removed. Must include a valid
839 ``id`` attribute, and should be considered invalidated upon return.
841 Raises
842 ------
843 AmbiguousDatasetError
844 Raised if any ``ref.id`` is `None`.
845 OrphanedRecordError
846 Raised if any dataset is still present in any `Datastore`.
847 """
848 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
849 storage = self._datasets.find(datasetType.name)
850 assert storage is not None
851 try:
852 storage.delete(refsForType)
853 except sqlalchemy.exc.IntegrityError as err:
854 raise OrphanedRecordError("One or more datasets is still "
855 "present in one or more Datastores.") from err
857 @transactional
858 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
859 """Add existing datasets to a `~CollectionType.TAGGED` collection.
861 If a `DatasetRef` with the same exact integer ID is already in the
862 collection, nothing is changed. If a `DatasetRef` with the same
863 `DatasetType` and data ID but with different integer ID
864 exists in the collection, `ConflictingDefinitionError` is raised.
866 Parameters
867 ----------
868 collection : `str`
869 Indicates the collection the datasets should be associated with.
870 refs : `Iterable` [ `DatasetRef` ]
871 An iterable of resolved `DatasetRef` instances that already exist
872 in this `Registry`.
874 Raises
875 ------
876 ConflictingDefinitionError
877 If a Dataset with the given `DatasetRef` already exists in the
878 given collection.
879 AmbiguousDatasetError
880 Raised if ``any(ref.id is None for ref in refs)``.
881 MissingCollectionError
882 Raised if ``collection`` does not exist in the registry.
883 TypeError
884 Raised if adding new datasets to the given ``collection`` is not
885 allowed.
886 """
887 collectionRecord = self._collections.find(collection)
888 if collectionRecord.type is not CollectionType.TAGGED:
889 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
890 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
891 storage = self._datasets.find(datasetType.name)
892 assert storage is not None
893 try:
894 storage.associate(collectionRecord, refsForType)
895 except sqlalchemy.exc.IntegrityError as err:
896 raise ConflictingDefinitionError(
897 f"Constraint violation while associating dataset of type {datasetType.name} with "
898 f"collection {collection}. This probably means that one or more datasets with the same "
899 f"dataset type and data ID already exist in the collection, but it may also indicate "
900 f"that the datasets do not exist."
901 ) from err
903 @transactional
904 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
905 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
907 ``collection`` and ``ref`` combinations that are not currently
908 associated are silently ignored.
910 Parameters
911 ----------
912 collection : `str`
913 The collection the datasets should no longer be associated with.
914 refs : `Iterable` [ `DatasetRef` ]
915 An iterable of resolved `DatasetRef` instances that already exist
916 in this `Registry`.
918 Raises
919 ------
920 AmbiguousDatasetError
921 Raised if any of the given dataset references is unresolved.
922 MissingCollectionError
923 Raised if ``collection`` does not exist in the registry.
924 TypeError
925 Raised if removing datasets from the given ``collection`` is not
926 allowed.
927 """
928 collectionRecord = self._collections.find(collection)
929 if collectionRecord.type is not CollectionType.TAGGED:
930 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
931 "expected TAGGED.")
932 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
933 storage = self._datasets.find(datasetType.name)
934 assert storage is not None
935 storage.disassociate(collectionRecord, refsForType)
937 @transactional
938 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
939 """Associate one or more datasets with a calibration collection and a
940 validity range within it.
942 Parameters
943 ----------
944 collection : `str`
945 The name of an already-registered `~CollectionType.CALIBRATION`
946 collection.
947 refs : `Iterable` [ `DatasetRef` ]
948 Datasets to be associated.
949 timespan : `Timespan`
950 The validity range for these datasets within the collection.
952 Raises
953 ------
954 AmbiguousDatasetError
955 Raised if any of the given `DatasetRef` instances is unresolved.
956 ConflictingDefinitionError
957 Raised if the collection already contains a different dataset with
958 the same `DatasetType` and data ID and an overlapping validity
959 range.
960 TypeError
961 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
962 collection or if one or more datasets are of a dataset type for
963 which `DatasetType.isCalibration` returns `False`.
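
 Examples
 --------
 A sketch, assuming ``refs`` are resolved `DatasetRef` instances for a
 calibration dataset type and ``t_start``/``t_end`` are
 `astropy.time.Time` values; the collection name is illustrative only::

     registry.registerCollection("my/calib", CollectionType.CALIBRATION)
     registry.certify("my/calib", refs, Timespan(t_start, t_end))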
964 """
965 collectionRecord = self._collections.find(collection)
966 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
967 storage = self._datasets[datasetType.name]
968 storage.certify(collectionRecord, refsForType, timespan)
970 @transactional
971 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
972 dataIds: Optional[Iterable[DataId]] = None) -> None:
973 """Remove or adjust datasets to clear a validity range within a
974 calibration collection.
976 Parameters
977 ----------
978 collection : `str`
979 The name of an already-registered `~CollectionType.CALIBRATION`
980 collection.
981 datasetType : `str` or `DatasetType`
982 Name or `DatasetType` instance for the datasets to be decertified.
983 timespan : `Timespan`
984 The validity range to remove datasets from within the collection.
985 Datasets that overlap this range but are not contained by it will
986 have their validity ranges adjusted to not overlap it, which may
987 split a single dataset validity range into two.
988 dataIds : `Iterable` [ `DataId` ], optional
989 Data IDs that should be decertified within the given validity range.
990 If `None`, all data IDs for ``datasetType`` will be
991 decertified.
993 Raises
994 ------
995 TypeError
996 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
997 collection or if ``datasetType.isCalibration() is False``.
998 """
999 collectionRecord = self._collections.find(collection)
1000 if isinstance(datasetType, str):
1001 storage = self._datasets[datasetType]
1002 else:
1003 storage = self._datasets[datasetType.name]
1004 standardizedDataIds = None
1005 if dataIds is not None:
1006 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
1007 for d in dataIds]
1008 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
1010 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1011 """Return an object that allows a new `Datastore` instance to
1012 communicate with this `Registry`.
1014 Returns
1015 -------
1016 manager : `DatastoreRegistryBridgeManager`
1017 Object that mediates communication between this `Registry` and its
1018 associated datastores.
1019 """
1020 return self._datastoreBridges
1022 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1023 """Retrieve datastore locations for a given dataset.
1025 Parameters
1026 ----------
1027 ref : `DatasetRef`
1028 A reference to the dataset for which to retrieve storage
1029 information.
1031 Returns
1032 -------
1033 datastores : `Iterable` [ `str` ]
1034 All the matching datastores holding this dataset.
1036 Raises
1037 ------
1038 AmbiguousDatasetError
1039 Raised if ``ref.id`` is `None`.
1040 """
1041 return self._datastoreBridges.findDatastores(ref)
1043 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1044 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
1045 **kwargs: Any) -> DataCoordinate:
1046 """Expand a dimension-based data ID to include additional information.
1048 Parameters
1049 ----------
1050 dataId : `DataCoordinate` or `dict`, optional
1051 Data ID to be expanded; augmented and overridden by ``kwargs``.
1052 graph : `DimensionGraph`, optional
1053 Set of dimensions for the expanded ID. If `None`, the dimensions
1054 will be inferred from the keys of ``dataId`` and ``kwargs``.
1055 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
1056 are silently ignored, providing a way to extract and expand a
1057 subset of a data ID.
1058 records : `Mapping` [`str`, `DimensionRecord`], optional
1059 Dimension record data to use before querying the database for that
1060 data, keyed by element name.
1061 **kwargs
1062 Additional keywords are treated like additional key-value pairs for
1063 ``dataId``, extending and overriding it.
1065 Returns
1066 -------
1067 expanded : `DataCoordinate`
1068 A data ID that includes full metadata for all of the dimensions it
1069 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1070 ``expanded.hasFull()`` both return `True`.
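
 Examples
 --------
 A sketch; the data ID values are illustrative only::

     dataId = registry.expandDataId(instrument="HSC", visit=903334)
     dataId.hasRecords()  # True; implied values and records are filled in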
1071 """
1072 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
1073 if standardized.hasRecords():
1074 return standardized
1075 if records is None:
1076 records = {}
1077 elif isinstance(records, NamedKeyMapping):
1078 records = records.byName()
1079 else:
1080 records = dict(records)
1081 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1082 records.update(dataId.records.byName())
1083 keys = standardized.byName()
1084 for element in standardized.graph.primaryKeyTraversalOrder:
1085 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1086 if record is ...:
1087 if isinstance(element, Dimension) and keys.get(element.name) is None:
1088 if element in standardized.graph.required:
1089 raise LookupError(
1090 f"No value or null value for required dimension {element.name}."
1091 )
1092 keys[element.name] = None
1093 record = None
1094 else:
1095 storage = self._dimensions[element]
1096 dataIdSet = DataCoordinateIterable.fromScalar(
1097 DataCoordinate.standardize(keys, graph=element.graph)
1098 )
1099 fetched = tuple(storage.fetch(dataIdSet))
1100 try:
1101 (record,) = fetched
1102 except ValueError:
1103 record = None
1104 records[element.name] = record
1105 if record is not None:
1106 for d in element.implied:
1107 value = getattr(record, d.name)
1108 if keys.setdefault(d.name, value) != value:
1109 raise InconsistentDataIdError(
1110 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1111 f"but {element.name} implies {d.name}={value!r}."
1112 )
1113 else:
1114 if element in standardized.graph.required:
1115 raise LookupError(
1116 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1117 )
1118 if element.alwaysJoin:
1119 raise InconsistentDataIdError(
1120 f"Could not fetch record for element {element.name} via keys {keys}, ",
1121 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1122 "related."
1123 )
1124 for d in element.implied:
1125 keys.setdefault(d.name, None)
1126 records.setdefault(d.name, None)
1127 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
1129 def insertDimensionData(self, element: Union[DimensionElement, str],
1130 *data: Union[Mapping[str, Any], DimensionRecord],
1131 conform: bool = True) -> None:
1132 """Insert one or more dimension records into the database.
1134 Parameters
1135 ----------
1136 element : `DimensionElement` or `str`
1137 The `DimensionElement` or name thereof that identifies the table
1138 records will be inserted into.
1139 data : `dict` or `DimensionRecord` (variadic)
1140 One or more records to insert.
1141 conform : `bool`, optional
1142 If `False` (`True` is default) perform no checking or conversions,
1143 and assume that ``element`` is a `DimensionElement` instance and
1144 ``data`` is one or more `DimensionRecord` instances of the
1145 appropriate subclass.
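
 Examples
 --------
 A sketch that inserts one ``instrument`` record; the field names and
 values are illustrative and depend on the configured dimension
 universe::

     registry.insertDimensionData(
         "instrument",
         {"name": "MyCam", "visit_max": 999999, "exposure_max": 999999,
          "detector_max": 99},
     )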
1146 """
1147 if conform:
1148 if isinstance(element, str):
1149 element = self.dimensions[element]
1150 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1151 for row in data]
1152 else:
1153 # Ignore typing since caller said to trust them with conform=False.
1154 records = data # type: ignore
1155 storage = self._dimensions[element] # type: ignore
1156 storage.insert(*records)
1158 def syncDimensionData(self, element: Union[DimensionElement, str],
1159 row: Union[Mapping[str, Any], DimensionRecord],
1160 conform: bool = True) -> bool:
1161 """Synchronize the given dimension record with the database, inserting
1162 if it does not already exist and comparing values if it does.
1164 Parameters
1165 ----------
1166 element : `DimensionElement` or `str`
1167 The `DimensionElement` or name thereof that identifies the table
1168 records will be inserted into.
1169 row : `dict` or `DimensionRecord`
1170 The record to insert.
1171 conform : `bool`, optional
1172 If `False` (`True` is default) perform no checking or conversions,
1173 and assume that ``element`` is a `DimensionElement` instance and
1174 ``row`` is a `DimensionRecord` instance of the
1175 appropriate subclass.
1177 Returns
1178 -------
1179 inserted : `bool`
1180 `True` if a new row was inserted, `False` otherwise.
1182 Raises
1183 ------
1184 ConflictingDefinitionError
1185 Raised if the record exists in the database (according to primary
1186 key lookup) but is inconsistent with the given one.
1187 """
1188 if conform:
1189 if isinstance(element, str):
1190 element = self.dimensions[element]
1191 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1192 else:
1193 # Ignore typing since caller said to trust them with conform=False.
1194 record = row # type: ignore
1195 storage = self._dimensions[element] # type: ignore
1196 return storage.sync(record)
1198 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1199 ) -> Iterator[DatasetType]:
1200 """Iterate over the dataset types whose names match an expression.
1202 Parameters
1203 ----------
1204 expression : `Any`, optional
1205 An expression that fully or partially identifies the dataset types
1206 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1207 `...` can be used to return all dataset types, and is the default.
1208 See :ref:`daf_butler_dataset_type_expressions` for more
1209 information.
1210 components : `bool`, optional
1211 If `True`, apply all expression patterns to component dataset type
1212 names as well. If `False`, never apply patterns to components.
1213 If `None` (default), apply patterns to components only if their
1214 parent datasets were not matched by the expression.
1215 Fully-specified component datasets (`str` or `DatasetType`
1216 instances) are always included.
1218 Yields
1219 ------
1220 datasetType : `DatasetType`
1221 A `DatasetType` instance whose name matches ``expression``.
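
 Examples
 --------
 A sketch; the names and pattern are illustrative only::

     import re

     list(registry.queryDatasetTypes("calexp"))
     list(registry.queryDatasetTypes(re.compile(r"calexp.*")))
     list(registry.queryDatasetTypes(...))  # every registered dataset type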
1222 """
1223 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1224 if wildcard is Ellipsis:
1225 for datasetType in self._datasets:
1226 # The dataset type can no longer be a component
1227 yield datasetType
1228 if components and datasetType.isComposite():
1229 # Automatically create the component dataset types
1230 for component in datasetType.makeAllComponentDatasetTypes():
1231 yield component
1232 return
1233 done: Set[str] = set()
1234 for name in wildcard.strings:
1235 storage = self._datasets.find(name)
1236 if storage is not None:
1237 done.add(storage.datasetType.name)
1238 yield storage.datasetType
1239 if wildcard.patterns:
1240 # If components (the argument) is None, we'll save component
1241 # dataset types that we might want to match, but only if their parents
1242 # didn't get included.
1243 componentsForLater = []
1244 for registeredDatasetType in self._datasets:
1245 # Components are not stored in registry so expand them here
1246 allDatasetTypes = [registeredDatasetType] \
1247 + registeredDatasetType.makeAllComponentDatasetTypes()
1248 for datasetType in allDatasetTypes:
1249 if datasetType.name in done:
1250 continue
1251 parentName, componentName = datasetType.nameAndComponent()
1252 if componentName is not None and not components:
1253 if components is None and parentName not in done:
1254 componentsForLater.append(datasetType)
1255 continue
1256 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1257 done.add(datasetType.name)
1258 yield datasetType
1259 # Go back and try to match saved components.
1260 for datasetType in componentsForLater:
1261 parentName, _ = datasetType.nameAndComponent()
1262 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1263 yield datasetType
1265 def queryCollections(self, expression: Any = ...,
1266 datasetType: Optional[DatasetType] = None,
1267 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1268 flattenChains: bool = False,
1269 includeChains: Optional[bool] = None) -> Iterator[str]:
1270 """Iterate over the collections whose names match an expression.
1272 Parameters
1273 ----------
1274 expression : `Any`, optional
1275 An expression that fully or partially identifies the collections
1276 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1277 `...` can be used to return all collections, and is the default.
1278 See :ref:`daf_butler_collection_expressions` for more
1279 information.
1280 datasetType : `DatasetType`, optional
1281 If provided, only yield collections that may contain datasets of
1282 this type. This is a conservative approximation in general; it may
1283 yield collections that do not have any such datasets.
1284 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1285 If provided, only yield collections of these types.
1286 flattenChains : `bool`, optional
1287 If `True` (`False` is default), recursively yield the child
1288 collections of matching `~CollectionType.CHAINED` collections.
1289 includeChains : `bool`, optional
1290 If `True`, yield records for matching `~CollectionType.CHAINED`
1291 collections. Default is the opposite of ``flattenChains``: include
1292 either CHAINED collections or their children, but not both.
1294 Yields
1295 ------
1296 collection : `str`
1297 The name of a collection that matches ``expression``.
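
 Examples
 --------
 A sketch; the collection name is illustrative only::

     list(registry.queryCollections(...))  # all collections
     list(registry.queryCollections("my/chain", flattenChains=True))
     list(registry.queryCollections(collectionTypes={CollectionType.RUN}))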
1298 """
1299 # Right now the datasetTypes argument is completely ignored, but that
1300 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1301 # ticket will take care of that.
1302 query = CollectionQuery.fromExpression(expression)
1303 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes),
1304 flattenChains=flattenChains, includeChains=includeChains):
1305 yield record.name
1307 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
1308 """Return a `QueryBuilder` instance capable of constructing and
1309 managing more complex queries than those obtainable via `Registry`
1310 interfaces.
1312 This is an advanced interface; downstream code should prefer
1313 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1314 are sufficient.
1316 Parameters
1317 ----------
1318 summary : `queries.QuerySummary`
1319 Object describing and categorizing the full set of dimensions that
1320 will be included in the query.
1322 Returns
1323 -------
1324 builder : `queries.QueryBuilder`
1325 Object that can be used to construct and perform advanced queries.
1326 """
1327 return queries.QueryBuilder(
1328 summary,
1329 queries.RegistryManagers(
1330 collections=self._collections,
1331 dimensions=self._dimensions,
1332 datasets=self._datasets
1333 )
1334 )
1336 def queryDatasets(self, datasetType: Any, *,
1337 collections: Any,
1338 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1339 dataId: Optional[DataId] = None,
1340 where: Optional[str] = None,
1341 findFirst: bool = False,
1342 components: Optional[bool] = None,
1343 check: bool = True,
1344 **kwargs: Any) -> queries.DatasetQueryResults:
1345 """Query for and iterate over dataset references matching user-provided
1346 criteria.
1348 Parameters
1349 ----------
1350 datasetType
1351 An expression that fully or partially identifies the dataset types
1352 to be queried. Allowed types include `DatasetType`, `str`,
1353 `re.Pattern`, and iterables thereof. The special value `...` can
1354 be used to query all dataset types. See
1355 :ref:`daf_butler_dataset_type_expressions` for more information.
1356 collections
1357 An expression that fully or partially identifies the collections
1358 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1359 thereof. `...` can be used to find datasets from all
1360 `~CollectionType.RUN` collections (no other collections are
1361 necessary, because all datasets are in a ``RUN`` collection). See
1362 :ref:`daf_butler_collection_expressions` for more information.
1363 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1364 Dimensions to include in the query (in addition to those used
1365 to identify the queried dataset type(s)), either to constrain
1366 the resulting datasets to those for which a matching dimension
1367 exists, or to relate the dataset type's dimensions to dimensions
1368 referenced by the ``dataId`` or ``where`` arguments.
1369 dataId : `dict` or `DataCoordinate`, optional
1370 A data ID whose key-value pairs are used as equality constraints
1371 in the query.
1372 where : `str`, optional
1373 A string expression similar to a SQL WHERE clause. May involve
1374 any column of a dimension table or (as a shortcut for the primary
1375 key column of a dimension table) dimension name. See
1376 :ref:`daf_butler_dimension_expressions` for more information.
1377 findFirst : `bool`, optional
1378 If `True` (`False` is default), for each result data ID, only
1379 yield one `DatasetRef` of each `DatasetType`, from the first
1380 collection in which a dataset of that dataset type appears
1381 (according to the order of ``collections`` passed in). If `True`,
1382 ``collections`` must not contain regular expressions and may not
1383 be `...`.
1384 components : `bool`, optional
1385 If `True`, apply all dataset expression patterns to component
1386 dataset type names as well. If `False`, never apply patterns to
1387 components. If `None` (default), apply patterns to components only
1388 if their parent datasets were not matched by the expression.
1389 Fully-specified component datasets (`str` or `DatasetType`
1390 instances) are always included.
1391 check : `bool`, optional
1392 If `True` (default) check the query for consistency before
1393 executing it. This may reject some valid queries that resemble
1394 common mistakes (e.g. queries for visits without specifying an
1395 instrument).
1396 **kwargs
1397 Additional keyword arguments are forwarded to
1398 `DataCoordinate.standardize` when processing the ``dataId``
1399 argument (and may be used to provide a constraining data ID even
1400 when the ``dataId`` argument is `None`).
1402 Returns
1403 -------
1404 refs : `queries.DatasetQueryResults`
1405 Dataset references matching the given query criteria.
1407 Raises
1408 ------
1409 TypeError
1410 Raised when the arguments are incompatible, such as when a
1411 collection wildcard is passed when ``findFirst`` is `True`.
1413 Notes
1414 -----
1415 When multiple dataset types are queried in a single call, the
1416 results of this operation are equivalent to querying for each dataset
1417 type separately in turn, and no information about the relationships
1418 between datasets of different types is included. In contexts where
1419 that kind of information is important, the recommended pattern is to
1420 use `queryDataIds` to first obtain data IDs (possibly with the
1421 desired dataset types and collections passed as constraints to the
1422 query), and then use multiple (generally much simpler) calls to
1423 `queryDatasets` with the returned data IDs passed as constraints.
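
 Examples
 --------
 A sketch of a constrained query; the dataset type, collection name,
 and ``where`` expression values are illustrative only::

     refs = registry.queryDatasets(
         "calexp",
         collections=["my/run"],
         where="instrument='HSC' AND visit > 903300",
         findFirst=True,
     )
     for ref in refs:
         print(ref.dataId)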
1424 """
1425 # Standardize the collections expression.
1426 if findFirst:
1427 collections = CollectionSearch.fromExpression(collections)
1428 else:
1429 collections = CollectionQuery.fromExpression(collections)
1430 # Standardize and expand the data ID provided as a constraint.
1431 standardizedDataId = self.expandDataId(dataId, **kwargs)
1433 # We can only query directly if given a non-component DatasetType
1434 # instance. If we were given an expression or str or a component
1435 # DatasetType instance, we'll populate this dict, recurse, and return.
1436 # If we already have a non-component DatasetType, it will remain None
1437 # and we'll run the query directly.
1438 composition: Optional[
1439 Dict[
1440 DatasetType, # parent dataset type
1441 List[Optional[str]] # component name, or None for parent
1442 ]
1443 ] = None
1444 if not isinstance(datasetType, DatasetType):
1445 # We were given a dataset type expression (which may be as simple
1446 # as a str). Loop over all matching datasets, delegating handling
1447 # of the `components` argument to queryDatasetTypes, as we populate
1448 # the composition dict.
1449 composition = defaultdict(list)
1450 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1451 parentName, componentName = trueDatasetType.nameAndComponent()
1452 if componentName is not None:
1453 parentDatasetType = self.getDatasetType(parentName)
1454 composition.setdefault(parentDatasetType, []).append(componentName)
1455 else:
1456 composition.setdefault(trueDatasetType, []).append(None)
1457 elif datasetType.isComponent():
1458 # We were given a true DatasetType instance, but it's a component.
1459 # the composition dict will have exactly one item.
1460 parentName, componentName = datasetType.nameAndComponent()
1461 parentDatasetType = self.getDatasetType(parentName)
1462 composition = {parentDatasetType: [componentName]}
1463 if composition is not None:
1464 # We need to recurse. Do that once for each parent dataset type.
1465 chain = []
1466 for parentDatasetType, componentNames in composition.items():
1467 parentResults = self.queryDatasets(
1468 parentDatasetType,
1469 collections=collections,
1470 dimensions=dimensions,
1471 dataId=standardizedDataId,
1472 where=where,
1473 findFirst=findFirst,
1474 check=check,
1475 )
1476 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1477 chain.append(
1478 parentResults.withComponents(componentNames)
1479 )
1480 else:
1481 # Should only happen if we know there would be no results.
1482 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1483 and not parentResults._chain
1484 return queries.ChainedDatasetQueryResults(chain)
1485 # If we get here, there's no need to recurse (or we are already
1486 # recursing; there can only ever be one level of recursion).
1488 # The full set of dimensions in the query is the combination of those
1489 # needed for the DatasetType and those explicitly requested, if any.
1490 requestedDimensionNames = set(datasetType.dimensions.names)
1491 if dimensions is not None:
1492 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1493 # Construct the summary structure needed to construct a QueryBuilder.
1494 summary = queries.QuerySummary(
1495 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1496 dataId=standardizedDataId,
1497 expression=where,
1498 check=check,
1499 )
1500 builder = self.makeQueryBuilder(summary)
1501 # Add the dataset subquery to the query, telling the QueryBuilder to
1502 # include the rank of the selected collection in the results only if we
1503 # need to findFirst. Note that if any of the collections are
1504 # actually wildcard expressions, and we've asked for deduplication,
1505 # this will raise TypeError for us.
1506 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1507 return queries.ChainedDatasetQueryResults(())
1508 query = builder.finish()
1509 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
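# Illustrative usage sketch (not part of the implementation): the
# component handling above means that querying for a component dataset
# type recurses through its parent exactly once.  The repository path and
# collection name below are hypothetical.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo")  # hypothetical repository
#     registry = butler.registry
#     refs = registry.queryDatasets(
#         "calexp.wcs",                  # component of the "calexp" dataset type
#         collections=["HSC/runs/RC2"],  # hypothetical collection
#         where="instrument='HSC' AND visit=903334",
#         findFirst=True,
#     )
#     for ref in refs:
#         print(ref.datasetType.name, ref.dataId)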
1511 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1512 dataId: Optional[DataId] = None,
1513 datasets: Any = None,
1514 collections: Any = None,
1515 where: Optional[str] = None,
1516 components: Optional[bool] = None,
1517 check: bool = True,
1518 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1519 """Query for data IDs matching user-provided criteria.
1521 Parameters
1522 ----------
1523 dimensions : `Dimension` or `str`, or iterable thereof
1524 The dimensions of the data IDs to yield, as either `Dimension`
1525 instances or `str`. Will be automatically expanded to a complete
1526 `DimensionGraph`.
1527 dataId : `dict` or `DataCoordinate`, optional
1528 A data ID whose key-value pairs are used as equality constraints
1529 in the query.
1530 datasets : `Any`, optional
1531 An expression that fully or partially identifies dataset types
1532 that should constrain the yielded data IDs. For example, including
1533 "raw" here would constrain the yielded ``instrument``,
1534 ``exposure``, ``detector``, and ``physical_filter`` values to only
1535 those for which at least one "raw" dataset exists in
1536 ``collections``. Allowed types include `DatasetType`, `str`,
1537 `re.Pattern`, and iterables thereof. Unlike other dataset type
1538 expressions, ``...`` is not permitted - it doesn't make sense to
1539 constrain data IDs on the existence of *all* datasets.
1540 See :ref:`daf_butler_dataset_type_expressions` for more
1541 information.
1542 collections : `Any`, optional
1543 An expression that fully or partially identifies the collections
1544 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1545 thereof. `...` can be used to return all collections. Must be
1546 provided if ``datasets`` is, and is ignored if it is not. See
1547 :ref:`daf_butler_collection_expressions` for more information.
1548 where : `str`, optional
1549 A string expression similar to a SQL WHERE clause. May involve
1550 any column of a dimension table or (as a shortcut for the primary
1551 key column of a dimension table) dimension name. See
1552 :ref:`daf_butler_dimension_expressions` for more information.
1553 components : `bool`, optional
1554 If `True`, apply all dataset expression patterns to component
1555 dataset type names as well. If `False`, never apply patterns to
1556 components. If `None` (default), apply patterns to components only
1557 if their parent datasets were not matched by the expression.
1558 Fully-specified component datasets (`str` or `DatasetType`
1559 instances) are always included.
1560 check : `bool`, optional
1561 If `True` (default) check the query for consistency before
1562 executing it. This may reject some valid queries that resemble
1563 common mistakes (e.g. queries for visits without specifying an
1564 instrument).
1565 **kwargs
1566 Additional keyword arguments are forwarded to
1567 `DataCoordinate.standardize` when processing the ``dataId``
1568 argument (and may be used to provide a constraining data ID even
1569 when the ``dataId`` argument is `None`).
1571 Returns
1572 -------
1573 dataIds : `DataCoordinateQueryResults`
1574 Data IDs matching the given query parameters. These are guaranteed
1575 to identify all dimensions (`DataCoordinate.hasFull` returns
1576 `True`), but will not contain `DimensionRecord` objects
1577 (`DataCoordinate.hasRecords` returns `False`). Call
1578 `DataCoordinateQueryResults.expanded` on the returned object to
1579 fetch those (and consider using
1580 `DataCoordinateQueryResults.materialize` on the returned object
1581 first if the expected number of rows is very large). See
1582 documentation for those methods for additional information.
1583 """
1584 dimensions = iterable(dimensions)
1585 standardizedDataId = self.expandDataId(dataId, **kwargs)
1586 standardizedDatasetTypes = set()
1587 requestedDimensions = self.dimensions.extract(dimensions)
1588 queryDimensionNames = set(requestedDimensions.names)
1589 if datasets is not None:
1590 if collections is None:
1591 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1592 for datasetType in self.queryDatasetTypes(datasets, components=components):
1593 queryDimensionNames.update(datasetType.dimensions.names)
1594 # If any matched dataset type is a component, just operate on
1595 # its parent instead, because Registry doesn't know anything
1596 # about what components exist, and here (unlike queryDatasets)
1597 # we don't care about returning them.
1598 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1599 if componentName is not None:
1600 datasetType = self.getDatasetType(parentDatasetTypeName)
1601 standardizedDatasetTypes.add(datasetType)
1602 # Preprocess collections expression in case the original included
1603 # single-pass iterators (we'll want to use it multiple times
1604 # below).
1605 collections = CollectionQuery.fromExpression(collections)
1607 summary = queries.QuerySummary(
1608 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1609 dataId=standardizedDataId,
1610 expression=where,
1611 check=check,
1612 )
1613 builder = self.makeQueryBuilder(summary)
1614 for datasetType in standardizedDatasetTypes:
1615 builder.joinDataset(datasetType, collections, isResult=False)
1616 query = builder.finish()
1617 return queries.DataCoordinateQueryResults(self._db, query)
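# Illustrative usage sketch (not part of the implementation), assuming a
# ``registry`` whose repository holds "raw" datasets in a hypothetical
# "HSC/raw/all" collection:
#
#     dataIds = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="HSC/raw/all",
#         where="instrument='HSC' AND exposure.observation_type='science'",
#     )
#     # Results identify all dimensions but carry no DimensionRecords;
#     # call expanded() to attach them before reading record metadata.
#     for dataId in dataIds.expanded():
#         print(dataId["exposure"], dataId["detector"])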
1619 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1620 dataId: Optional[DataId] = None,
1621 datasets: Any = None,
1622 collections: Any = None,
1623 where: Optional[str] = None,
1624 components: Optional[bool] = None,
1625 check: bool = True,
1626 **kwargs: Any) -> Iterator[DimensionRecord]:
1627 """Query for dimension information matching user-provided criteria.
1629 Parameters
1630 ----------
1631 element : `DimensionElement` or `str`
1632 The dimension element to obtain records for.
1633 dataId : `dict` or `DataCoordinate`, optional
1634 A data ID whose key-value pairs are used as equality constraints
1635 in the query.
1636 datasets : `Any`, optional
1637 An expression that fully or partially identifies dataset types
1638 that should constrain the yielded records. See `queryDataIds` and
1639 :ref:`daf_butler_dataset_type_expressions` for more information.
1640 collections : `Any`, optional
1641 An expression that fully or partially identifies the collections
1642 to search for datasets. See `queryDataIds` and
1643 :ref:`daf_butler_collection_expressions` for more information.
1644 where : `str`, optional
1645 A string expression similar to a SQL WHERE clause. See
1646 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1647 information.
1648 components : `bool`, optional
1649 Whether to apply dataset expressions to components as well.
1650 See `queryDataIds` for more information.
1651 check : `bool`, optional
1652 If `True` (default) check the query for consistency before
1653 executing it. This may reject some valid queries that resemble
1654 common mistakes (e.g. queries for visits without specifying an
1655 instrument).
1656 **kwargs
1657 Additional keyword arguments are forwarded to
1658 `DataCoordinate.standardize` when processing the ``dataId``
1659 argument (and may be used to provide a constraining data ID even
1660 when the ``dataId`` argument is `None`).
1662 Returns
1663 -------
1664 dimensionRecords : `Iterator` [ `DimensionRecord` ]
1665 Dimension records matching the given query parameters.
1666 """
1667 if not isinstance(element, DimensionElement):
1668 element = self.dimensions[element]
1669 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1670 where=where, components=components, check=check, **kwargs)
1671 return iter(self._dimensions[element].fetch(dataIds))
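# Illustrative usage sketch (not part of the implementation), assuming
# the default dimension universe, in which "detector" records carry
# fields such as ``full_name`` and ``purpose``:
#
#     for record in registry.queryDimensionRecords(
#         "detector",
#         where="instrument='HSC' AND detector.purpose='SCIENCE'",
#     ):
#         print(record.id, record.full_name)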
1673 def queryDatasetAssociations(
1674 self,
1675 datasetType: Union[str, DatasetType],
1676 collections: Any = ...,
1677 *,
1678 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1679 flattenChains: bool = False,
1680 ) -> Iterator[DatasetAssociation]:
1681 """Iterate over dataset-collection combinations where the dataset is in
1682 the collection.
1684 This method is a temporary placeholder for better support for
1685 association results in `queryDatasets`. It will probably be
1686 removed in the future, and should be avoided in production code
1687 whenever possible.
1689 Parameters
1690 ----------
1691 datasetType : `DatasetType` or `str`
1692 A dataset type object or the name of one.
1693 collections : `Any`, optional
1694 An expression that fully or partially identifies the collections
1695 to search for datasets. See `queryCollections` and
1696 :ref:`daf_butler_collection_expressions` for more information.
1697 collectionTypes : `Iterable` [ `CollectionType` ], optional
1698 If provided, only yield associations from collections of these
1699 types.
1700 flattenChains : `bool`, optional
1701 If `True`, search in the children of
1702 `~CollectionType.CHAINED` collections. If `False` (default),
1703 ``CHAINED`` collections are ignored.
1705 Yields
1706 ------
1707 association : `DatasetAssociation`
1708 Object representing the relationship between a single dataset and
1709 a single collection.
1710 """
1711 collections = CollectionQuery.fromExpression(collections)
1712 tsRepr = self._db.getTimespanRepresentation()
1713 if isinstance(datasetType, str):
1714 storage = self._datasets[datasetType]
1715 else:
1716 storage = self._datasets[datasetType.name]
1717 for collectionRecord in collections.iter(self._collections,
1718 collectionTypes=frozenset(collectionTypes),
1719 flattenChains=flattenChains):
1720 query = storage.select(collectionRecord)
1721 if query is None:
1722 continue
1723 for row in self._db.query(query.combine()):
1724 dataId = DataCoordinate.fromRequiredValues(
1725 storage.datasetType.dimensions,
1726 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1727 )
1728 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
1729 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1730 conform=False)
1731 if collectionRecord.type is CollectionType.CALIBRATION:
1732 timespan = tsRepr.extract(row)
1733 else:
1734 timespan = None
1735 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
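# Illustrative usage sketch (not part of the implementation): list every
# TAGGED or CALIBRATION collection membership of "bias" datasets,
# including the validity timespan for calibration collections.  The
# dataset type name is hypothetical for a given repository.
#
#     from lsst.daf.butler import CollectionType
#
#     for assoc in registry.queryDatasetAssociations(
#         "bias",
#         collections=...,  # the default: search all collections
#         collectionTypes={CollectionType.TAGGED, CollectionType.CALIBRATION},
#         flattenChains=True,
#     ):
#         print(assoc.collection, assoc.ref.dataId, assoc.timespan)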
1737 storageClasses: StorageClassFactory
1738 """All storage classes known to the registry (`StorageClassFactory`).
1739 """