Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from lsst.utils import doImport
48from ..core import (
49 ButlerURI,
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetRef,
56 DatasetType,
57 ddl,
58 Dimension,
59 DimensionConfig,
60 DimensionElement,
61 DimensionGraph,
62 DimensionRecord,
63 DimensionUniverse,
64 NamedKeyMapping,
65 NameLookupMapping,
66 StorageClassFactory,
67 Timespan,
68)
69from . import queries
70from ..core.utils import iterable, transactional
71from ._config import RegistryConfig
72from ._collectionType import CollectionType
73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
75from .interfaces import ChainedCollectionRecord, RunRecord
76from .versions import ButlerVersionsManager, DigestMismatchError
78if TYPE_CHECKING: 78 ↛ 79line 78 didn't jump to line 79, because the condition on line 78 was never true
79 from .._butlerConfig import ButlerConfig
80 from .interfaces import (
81 ButlerAttributeManager,
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 DatastoreRegistryBridgeManager,
88 )
91_LOG = logging.getLogger(__name__)
93# key for dimensions configuration in attributes table
94_DIMENSIONS_ATTR = "config:dimensions.json"
97class Registry:
98 """Registry interface.
100 Parameters
101 ----------
102 database : `Database`
103 Database instance to store Registry.
104 attributes : `type`
105 Manager class implementing `ButlerAttributeManager`.
106 opaque : `type`
107 Manager class implementing `OpaqueTableStorageManager`.
108 dimensions : `type`
109 Manager class implementing `DimensionRecordStorageManager`.
110 collections : `type`
111 Manager class implementing `CollectionManager`.
112 datasets : `type`
113 Manager class implementing `DatasetRecordStorageManager`.
114 datastoreBridges : `type`
115 Manager class implementing `DatastoreRegistryBridgeManager`.
116 dimensionConfig : `DimensionConfig`, optional
117 Dimension universe configuration, only used when ``create`` is True.
118 writeable : `bool`, optional
119 If True then Registry will support write operations.
120 create : `bool`, optional
121 If True then database schema will be initialized, it must be empty
122 before instantiating Registry.
123 """
125 defaultConfigFile: Optional[str] = None
126 """Path to configuration defaults. Accessed within the ``configs`` resource
127 or relative to a search path. Can be None if no defaults specified.
128 """
130 @classmethod
131 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
132 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
133 butlerRoot: Optional[str] = None) -> Registry:
134 """Create registry database and return `Registry` instance.
136 This method initializes database contents, database must be empty
137 prior to calling this method.
139 Parameters
140 ----------
141 config : `RegistryConfig` or `str`, optional
142 Registry configuration, if missing then default configuration will
143 be loaded from registry.yaml.
144 dimensionConfig : `DimensionConfig` or `str`, optional
145 Dimensions configuration, if missing then default configuration
146 will be loaded from dimensions.yaml.
147 butlerRoot : `str`, optional
148 Path to the repository root this `Registry` will manage.
150 Returns
151 -------
152 registry : `Registry`
153 A new `Registry` instance.
154 """
155 if isinstance(config, str):
156 config = RegistryConfig(config)
157 elif config is None:
158 config = RegistryConfig()
159 elif not isinstance(config, RegistryConfig):
160 raise TypeError(f"Incompatible Registry configuration type: {type(config)}")
161 config.replaceRoot(butlerRoot)
163 if isinstance(dimensionConfig, str):
164 dimensionConfig = DimensionConfig(config)
165 elif dimensionConfig is None:
166 dimensionConfig = DimensionConfig()
167 elif not isinstance(dimensionConfig, DimensionConfig):
168 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
170 DatabaseClass = config.getDatabaseClass()
171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
172 namespace=config.get("namespace"))
173 attributes = doImport(config["managers", "attributes"])
174 opaque = doImport(config["managers", "opaque"])
175 dimensions = doImport(config["managers", "dimensions"])
176 collections = doImport(config["managers", "collections"])
177 datasets = doImport(config["managers", "datasets"])
178 datastoreBridges = doImport(config["managers", "datastores"])
180 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
181 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
182 dimensionConfig=dimensionConfig, create=True)
184 @classmethod
185 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
186 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True) -> Registry:
187 """Create `Registry` subclass instance from `config`.
189 Registry database must be inbitialized prior to calling this method.
191 Parameters
192 ----------
193 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
194 Registry configuration
195 butlerRoot : `str` or `ButlerURI`, optional
196 Path to the repository root this `Registry` will manage.
197 writeable : `bool`, optional
198 If `True` (default) create a read-write connection to the database.
200 Returns
201 -------
202 registry : `Registry` (subclass)
203 A new `Registry` subclass instance.
204 """
205 if not isinstance(config, RegistryConfig):
206 if isinstance(config, str) or isinstance(config, Config):
207 config = RegistryConfig(config)
208 else:
209 raise ValueError("Incompatible Registry configuration: {}".format(config))
210 config.replaceRoot(butlerRoot)
211 DatabaseClass = config.getDatabaseClass()
212 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
213 namespace=config.get("namespace"), writeable=writeable)
214 attributes = doImport(config["managers", "attributes"])
215 opaque = doImport(config["managers", "opaque"])
216 dimensions = doImport(config["managers", "dimensions"])
217 collections = doImport(config["managers", "collections"])
218 datasets = doImport(config["managers", "datasets"])
219 datastoreBridges = doImport(config["managers", "datastores"])
221 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque,
222 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
223 dimensionConfig=None, writeable=writeable)
225 def __init__(self, database: Database, *,
226 attributes: Type[ButlerAttributeManager],
227 opaque: Type[OpaqueTableStorageManager],
228 dimensions: Type[DimensionRecordStorageManager],
229 collections: Type[CollectionManager],
230 datasets: Type[DatasetRecordStorageManager],
231 datastoreBridges: Type[DatastoreRegistryBridgeManager],
232 dimensionConfig: Optional[DimensionConfig] = None,
233 writeable: bool = True,
234 create: bool = False):
235 self._db = database
236 self.storageClasses = StorageClassFactory()
238 # With existing registry we have to read dimensions config from
239 # database before we initialize all other managers.
240 if dimensionConfig is None:
241 assert not create, "missing DimensionConfig when create=True"
242 with self._db.declareStaticTables(create=False) as context:
243 self._attributes = attributes.initialize(self._db, context)
245 versions = ButlerVersionsManager(
246 self._attributes,
247 dict(attributes=self._attributes)
248 )
249 # verify that configured versions are compatible with schema
250 versions.checkManagersConfig()
251 versions.checkManagersVersions(writeable)
253 # get serialized as a string from database
254 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR)
255 if dimensionsString is not None:
256 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json"))
257 else:
258 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
260 # make universe
261 universe = DimensionUniverse(dimensionConfig)
263 with self._db.declareStaticTables(create=create) as context:
264 self._attributes = attributes.initialize(self._db, context)
265 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
266 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions)
267 self._datasets = datasets.initialize(self._db, context,
268 collections=self._collections,
269 dimensions=self._dimensions)
270 self._opaque = opaque.initialize(self._db, context)
271 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
272 opaque=self._opaque,
273 datasets=datasets,
274 universe=self._dimensions.universe)
275 versions = ButlerVersionsManager(
276 self._attributes,
277 dict(
278 attributes=self._attributes,
279 opaque=self._opaque,
280 dimensions=self._dimensions,
281 collections=self._collections,
282 datasets=self._datasets,
283 datastores=self._datastoreBridges,
284 )
285 )
286 # store managers and their versions in attributes table
287 context.addInitializer(lambda db: versions.storeManagersConfig())
288 context.addInitializer(lambda db: versions.storeManagersVersions())
289 # dump universe config as json into attributes (faster than YAML)
290 json = dimensionConfig.dump(format="json")
291 if json is not None:
292 # Convert Optional[str] to str for mypy
293 json_str = json
294 context.addInitializer(
295 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str)
296 )
297 else:
298 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
300 if not create:
301 # verify that configured versions are compatible with schema
302 versions.checkManagersConfig()
303 versions.checkManagersVersions(writeable)
304 try:
305 versions.checkManagersDigests()
306 except DigestMismatchError as exc:
307 # potentially digest mismatch is a serious error but during
308 # development it could be benign, treat this as warning for
309 # now.
310 _LOG.warning(f"Registry schema digest mismatch: {exc}")
312 self._dimensions.refresh()
313 self._collections.refresh()
314 self._datasets.refresh()
316 def __str__(self) -> str:
317 return str(self._db)
319 def __repr__(self) -> str:
320 return f"Registry({self._db!r}, {self.dimensions!r})"
322 def isWriteable(self) -> bool:
323 """Return `True` if this registry allows write operations, and `False`
324 otherwise.
325 """
326 return self._db.isWriteable()
328 @property
329 def dimensions(self) -> DimensionUniverse:
330 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
331 """
332 return self._dimensions.universe
334 @contextlib.contextmanager
335 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
336 """Return a context manager that represents a transaction.
337 """
338 try:
339 with self._db.transaction(savepoint=savepoint):
340 yield
341 except BaseException:
342 # TODO: this clears the caches sometimes when we wouldn't actually
343 # need to. Can we avoid that?
344 self._dimensions.clearCaches()
345 raise
347 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
348 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
349 other data repository client.
351 Opaque table records can be added via `insertOpaqueData`, retrieved via
352 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
354 Parameters
355 ----------
356 tableName : `str`
357 Logical name of the opaque table. This may differ from the
358 actual name used in the database by a prefix and/or suffix.
359 spec : `ddl.TableSpec`
360 Specification for the table to be added.
361 """
362 self._opaque.register(tableName, spec)
364 @transactional
365 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
366 """Insert records into an opaque table.
368 Parameters
369 ----------
370 tableName : `str`
371 Logical name of the opaque table. Must match the name used in a
372 previous call to `registerOpaqueTable`.
373 data
374 Each additional positional argument is a dictionary that represents
375 a single row to be added.
376 """
377 self._opaque[tableName].insert(*data)
379 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
380 """Retrieve records from an opaque table.
382 Parameters
383 ----------
384 tableName : `str`
385 Logical name of the opaque table. Must match the name used in a
386 previous call to `registerOpaqueTable`.
387 where
388 Additional keyword arguments are interpreted as equality
389 constraints that restrict the returned rows (combined with AND);
390 keyword arguments are column names and values are the values they
391 must have.
393 Yields
394 ------
395 row : `dict`
396 A dictionary representing a single result row.
397 """
398 yield from self._opaque[tableName].fetch(**where)
400 @transactional
401 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
402 """Remove records from an opaque table.
404 Parameters
405 ----------
406 tableName : `str`
407 Logical name of the opaque table. Must match the name used in a
408 previous call to `registerOpaqueTable`.
409 where
410 Additional keyword arguments are interpreted as equality
411 constraints that restrict the deleted rows (combined with AND);
412 keyword arguments are column names and values are the values they
413 must have.
414 """
415 self._opaque[tableName].delete(**where)
417 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
418 doc: Optional[str] = None) -> None:
419 """Add a new collection if one with the given name does not exist.
421 Parameters
422 ----------
423 name : `str`
424 The name of the collection to create.
425 type : `CollectionType`
426 Enum value indicating the type of collection to create.
427 doc : `str`, optional
428 Documentation string for the collection.
430 Notes
431 -----
432 This method cannot be called within transactions, as it needs to be
433 able to perform its own transaction to be concurrent.
434 """
435 self._collections.register(name, type, doc=doc)
437 def getCollectionType(self, name: str) -> CollectionType:
438 """Return an enumeration value indicating the type of the given
439 collection.
441 Parameters
442 ----------
443 name : `str`
444 The name of the collection.
446 Returns
447 -------
448 type : `CollectionType`
449 Enum value indicating the type of this collection.
451 Raises
452 ------
453 MissingCollectionError
454 Raised if no collection with the given name exists.
455 """
456 return self._collections.find(name).type
458 def registerRun(self, name: str, doc: Optional[str] = None) -> None:
459 """Add a new run if one with the given name does not exist.
461 Parameters
462 ----------
463 name : `str`
464 The name of the run to create.
465 doc : `str`, optional
466 Documentation string for the collection.
468 Notes
469 -----
470 This method cannot be called within transactions, as it needs to be
471 able to perform its own transaction to be concurrent.
472 """
473 self._collections.register(name, CollectionType.RUN, doc=doc)
475 @transactional
476 def removeCollection(self, name: str) -> None:
477 """Completely remove the given collection.
479 Parameters
480 ----------
481 name : `str`
482 The name of the collection to remove.
484 Raises
485 ------
486 MissingCollectionError
487 Raised if no collection with the given name exists.
489 Notes
490 -----
491 If this is a `~CollectionType.RUN` collection, all datasets and quanta
492 in it are also fully removed. This requires that those datasets be
493 removed (or at least trashed) from any datastores that hold them first.
495 A collection may not be deleted as long as it is referenced by a
496 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
497 be deleted or redefined first.
498 """
499 self._collections.remove(name)
501 def getCollectionChain(self, parent: str) -> CollectionSearch:
502 """Return the child collections in a `~CollectionType.CHAINED`
503 collection.
505 Parameters
506 ----------
507 parent : `str`
508 Name of the chained collection. Must have already been added via
509 a call to `Registry.registerCollection`.
511 Returns
512 -------
513 children : `CollectionSearch`
514 An object that defines the search path of the collection.
515 See :ref:`daf_butler_collection_expressions` for more information.
517 Raises
518 ------
519 MissingCollectionError
520 Raised if ``parent`` does not exist in the `Registry`.
521 TypeError
522 Raised if ``parent`` does not correspond to a
523 `~CollectionType.CHAINED` collection.
524 """
525 record = self._collections.find(parent)
526 if record.type is not CollectionType.CHAINED:
527 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
528 assert isinstance(record, ChainedCollectionRecord)
529 return record.children
531 @transactional
532 def setCollectionChain(self, parent: str, children: Any) -> None:
533 """Define or redefine a `~CollectionType.CHAINED` collection.
535 Parameters
536 ----------
537 parent : `str`
538 Name of the chained collection. Must have already been added via
539 a call to `Registry.registerCollection`.
540 children : `Any`
541 An expression defining an ordered search of child collections,
542 generally an iterable of `str`; see
543 :ref:`daf_butler_collection_expressions` for more information.
545 Raises
546 ------
547 MissingCollectionError
548 Raised when any of the given collections do not exist in the
549 `Registry`.
550 TypeError
551 Raised if ``parent`` does not correspond to a
552 `~CollectionType.CHAINED` collection.
553 ValueError
554 Raised if the given collections contains a cycle.
555 """
556 record = self._collections.find(parent)
557 if record.type is not CollectionType.CHAINED:
558 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
559 assert isinstance(record, ChainedCollectionRecord)
560 children = CollectionSearch.fromExpression(children)
561 if children != record.children:
562 record.update(self._collections, children)
564 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
565 """Retrieve the documentation string for a collection.
567 Parameters
568 ----------
569 name : `str`
570 Name of the collection.
572 Returns
573 -------
574 docs : `str` or `None`
575 Docstring for the collection with the given name.
576 """
577 return self._collections.getDocumentation(self._collections.find(collection).key)
579 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
580 """Set the documentation string for a collection.
582 Parameters
583 ----------
584 name : `str`
585 Name of the collection.
586 docs : `str` or `None`
587 Docstring for the collection with the given name; will replace any
588 existing docstring. Passing `None` will remove any existing
589 docstring.
590 """
591 self._collections.setDocumentation(self._collections.find(collection).key, doc)
593 def registerDatasetType(self, datasetType: DatasetType) -> bool:
594 """
595 Add a new `DatasetType` to the Registry.
597 It is not an error to register the same `DatasetType` twice.
599 Parameters
600 ----------
601 datasetType : `DatasetType`
602 The `DatasetType` to be added.
604 Returns
605 -------
606 inserted : `bool`
607 `True` if ``datasetType`` was inserted, `False` if an identical
608 existing `DatsetType` was found. Note that in either case the
609 DatasetType is guaranteed to be defined in the Registry
610 consistently with the given definition.
612 Raises
613 ------
614 ValueError
615 Raised if the dimensions or storage class are invalid.
616 ConflictingDefinitionError
617 Raised if this DatasetType is already registered with a different
618 definition.
620 Notes
621 -----
622 This method cannot be called within transactions, as it needs to be
623 able to perform its own transaction to be concurrent.
624 """
625 _, inserted = self._datasets.register(datasetType)
626 return inserted
628 def removeDatasetType(self, name: str) -> None:
629 """Remove the named `DatasetType` from the registry.
631 .. warning::
633 Registry caches the dataset type definitions. This means that
634 deleting the dataset type definition may result in unexpected
635 behavior from other butler processes that are active that have
636 not seen the deletion.
638 Parameters
639 ----------
640 name : `str`
641 Name of the type to be removed.
643 Raises
644 ------
645 lsst.daf.butler.registry.OrphanedRecordError
646 Raised if an attempt is made to remove the dataset type definition
647 when there are already datasets associated with it.
649 Notes
650 -----
651 If the dataset type is not registered the method will return without
652 action.
653 """
654 self._datasets.remove(name)
656 def getDatasetType(self, name: str) -> DatasetType:
657 """Get the `DatasetType`.
659 Parameters
660 ----------
661 name : `str`
662 Name of the type.
664 Returns
665 -------
666 type : `DatasetType`
667 The `DatasetType` associated with the given name.
669 Raises
670 ------
671 KeyError
672 Requested named DatasetType could not be found in registry.
673 """
674 return self._datasets[name].datasetType
676 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
677 collections: Any, timespan: Optional[Timespan] = None,
678 **kwargs: Any) -> Optional[DatasetRef]:
679 """Find a dataset given its `DatasetType` and data ID.
681 This can be used to obtain a `DatasetRef` that permits the dataset to
682 be read from a `Datastore`. If the dataset is a component and can not
683 be found using the provided dataset type, a dataset ref for the parent
684 will be returned instead but with the correct dataset type.
686 Parameters
687 ----------
688 datasetType : `DatasetType` or `str`
689 A `DatasetType` or the name of one.
690 dataId : `dict` or `DataCoordinate`, optional
691 A `dict`-like object containing the `Dimension` links that identify
692 the dataset within a collection.
693 collections
694 An expression that fully or partially identifies the collections to
695 search for the dataset; see
696 :ref:`daf_butler_collection_expressions` for more information.
697 timespan : `Timespan`, optional
698 A timespan that the validity range of the dataset must overlap.
699 If not provided, any `~CollectionType.CALIBRATION` collections
700 matched by the ``collections`` argument will not be searched.
701 **kwargs
702 Additional keyword arguments passed to
703 `DataCoordinate.standardize` to convert ``dataId`` to a true
704 `DataCoordinate` or augment an existing one.
706 Returns
707 -------
708 ref : `DatasetRef`
709 A reference to the dataset, or `None` if no matching Dataset
710 was found.
712 Raises
713 ------
714 LookupError
715 Raised if one or more data ID keys are missing.
716 KeyError
717 Raised if the dataset type does not exist.
718 MissingCollectionError
719 Raised if any of ``collections`` does not exist in the registry.
721 Notes
722 -----
723 This method simply returns `None` and does not raise an exception even
724 when the set of collections searched is intrinsically incompatible with
725 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
726 only `~CollectionType.CALIBRATION` collections are being searched.
727 This may make it harder to debug some lookup failures, but the behavior
728 is intentional; we consider it more important that failed searches are
729 reported consistently, regardless of the reason, and that adding
730 additional collections that do not contain a match to the search path
731 never changes the behavior.
732 """
733 if isinstance(datasetType, DatasetType):
734 storage = self._datasets[datasetType.name]
735 else:
736 storage = self._datasets[datasetType]
737 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
738 universe=self.dimensions, **kwargs)
739 collections = CollectionSearch.fromExpression(collections)
740 for collectionRecord in collections.iter(self._collections):
741 if (collectionRecord.type is CollectionType.CALIBRATION
742 and (not storage.datasetType.isCalibration() or timespan is None)):
743 continue
744 result = storage.find(collectionRecord, dataId, timespan=timespan)
745 if result is not None:
746 return result
748 return None
750 @transactional
751 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
752 run: str) -> List[DatasetRef]:
753 """Insert one or more datasets into the `Registry`
755 This always adds new datasets; to associate existing datasets with
756 a new collection, use ``associate``.
758 Parameters
759 ----------
760 datasetType : `DatasetType` or `str`
761 A `DatasetType` or the name of one.
762 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
763 Dimension-based identifiers for the new datasets.
764 run : `str`
765 The name of the run that produced the datasets.
767 Returns
768 -------
769 refs : `list` of `DatasetRef`
770 Resolved `DatasetRef` instances for all given data IDs (in the same
771 order).
773 Raises
774 ------
775 ConflictingDefinitionError
776 If a dataset with the same dataset type and data ID as one of those
777 given already exists in ``run``.
778 MissingCollectionError
779 Raised if ``run`` does not exist in the registry.
780 """
781 if isinstance(datasetType, DatasetType):
782 storage = self._datasets.find(datasetType.name)
783 if storage is None:
784 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
785 else:
786 storage = self._datasets.find(datasetType)
787 if storage is None:
788 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
789 runRecord = self._collections.find(run)
790 if runRecord.type is not CollectionType.RUN:
791 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
792 assert isinstance(runRecord, RunRecord)
793 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
794 for dataId in dataIds]
795 try:
796 refs = list(storage.insert(runRecord, expandedDataIds))
797 except sqlalchemy.exc.IntegrityError as err:
798 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
799 f"one or more datasets of type {storage.datasetType} into "
800 f"collection '{run}'. "
801 f"This probably means a dataset with the same data ID "
802 f"and dataset type already exists, but it may also mean a "
803 f"dimension row is missing.") from err
804 return refs
806 def getDataset(self, id: int) -> Optional[DatasetRef]:
807 """Retrieve a Dataset entry.
809 Parameters
810 ----------
811 id : `int`
812 The unique identifier for the dataset.
814 Returns
815 -------
816 ref : `DatasetRef` or `None`
817 A ref to the Dataset, or `None` if no matching Dataset
818 was found.
819 """
820 ref = self._datasets.getDatasetRef(id)
821 if ref is None:
822 return None
823 return ref
825 @transactional
826 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
827 """Remove datasets from the Registry.
829 The datasets will be removed unconditionally from all collections, and
830 any `Quantum` that consumed this dataset will instead be marked with
831 having a NULL input. `Datastore` records will *not* be deleted; the
832 caller is responsible for ensuring that the dataset has already been
833 removed from all Datastores.
835 Parameters
836 ----------
837 refs : `Iterable` of `DatasetRef`
838 References to the datasets to be removed. Must include a valid
839 ``id`` attribute, and should be considered invalidated upon return.
841 Raises
842 ------
843 AmbiguousDatasetError
844 Raised if any ``ref.id`` is `None`.
845 OrphanedRecordError
846 Raised if any dataset is still present in any `Datastore`.
847 """
848 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
849 storage = self._datasets.find(datasetType.name)
850 assert storage is not None
851 try:
852 storage.delete(refsForType)
853 except sqlalchemy.exc.IntegrityError as err:
854 raise OrphanedRecordError("One or more datasets is still "
855 "present in one or more Datastores.") from err
857 @transactional
858 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
859 """Add existing datasets to a `~CollectionType.TAGGED` collection.
861 If a DatasetRef with the same exact integer ID is already in a
862 collection nothing is changed. If a `DatasetRef` with the same
863 `DatasetType` and data ID but with different integer ID
864 exists in the collection, `ConflictingDefinitionError` is raised.
866 Parameters
867 ----------
868 collection : `str`
869 Indicates the collection the datasets should be associated with.
870 refs : `Iterable` [ `DatasetRef` ]
871 An iterable of resolved `DatasetRef` instances that already exist
872 in this `Registry`.
874 Raises
875 ------
876 ConflictingDefinitionError
877 If a Dataset with the given `DatasetRef` already exists in the
878 given collection.
879 AmbiguousDatasetError
880 Raised if ``any(ref.id is None for ref in refs)``.
881 MissingCollectionError
882 Raised if ``collection`` does not exist in the registry.
883 TypeError
884 Raise adding new datasets to the given ``collection`` is not
885 allowed.
886 """
887 collectionRecord = self._collections.find(collection)
888 if collectionRecord.type is not CollectionType.TAGGED:
889 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
890 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
891 storage = self._datasets.find(datasetType.name)
892 assert storage is not None
893 try:
894 storage.associate(collectionRecord, refsForType)
895 except sqlalchemy.exc.IntegrityError as err:
896 raise ConflictingDefinitionError(
897 f"Constraint violation while associating dataset of type {datasetType.name} with "
898 f"collection {collection}. This probably means that one or more datasets with the same "
899 f"dataset type and data ID already exist in the collection, but it may also indicate "
900 f"that the datasets do not exist."
901 ) from err
903 @transactional
904 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
905 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
907 ``collection`` and ``ref`` combinations that are not currently
908 associated are silently ignored.
910 Parameters
911 ----------
912 collection : `str`
913 The collection the datasets should no longer be associated with.
914 refs : `Iterable` [ `DatasetRef` ]
915 An iterable of resolved `DatasetRef` instances that already exist
916 in this `Registry`.
918 Raises
919 ------
920 AmbiguousDatasetError
921 Raised if any of the given dataset references is unresolved.
922 MissingCollectionError
923 Raised if ``collection`` does not exist in the registry.
924 TypeError
925 Raise adding new datasets to the given ``collection`` is not
926 allowed.
927 """
928 collectionRecord = self._collections.find(collection)
929 if collectionRecord.type is not CollectionType.TAGGED:
930 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
931 "expected TAGGED.")
932 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
933 storage = self._datasets.find(datasetType.name)
934 assert storage is not None
935 storage.disassociate(collectionRecord, refsForType)
937 @transactional
938 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
939 """Associate one or more datasets with a calibration collection and a
940 validity range within it.
942 Parameters
943 ----------
944 collection : `str`
945 The name of an already-registered `~CollectionType.CALIBRATION`
946 collection.
947 refs : `Iterable` [ `DatasetRef` ]
948 Datasets to be associated.
949 timespan : `Timespan`
950 The validity range for these datasets within the collection.
952 Raises
953 ------
954 AmbiguousDatasetError
955 Raised if any of the given `DatasetRef` instances is unresolved.
956 ConflictingDefinitionError
957 Raised if the collection already contains a different dataset with
958 the same `DatasetType` and data ID and an overlapping validity
959 range.
960 TypeError
961 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
962 collection or if one or more datasets are of a dataset type for
963 which `DatasetType.isCalibration` returns `False`.
964 """
965 collectionRecord = self._collections.find(collection)
966 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
967 storage = self._datasets[datasetType.name]
968 storage.certify(collectionRecord, refsForType, timespan)
970 @transactional
971 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
972 dataIds: Optional[Iterable[DataId]] = None) -> None:
973 """Remove or adjust datasets to clear a validity range within a
974 calibration collection.
976 Parameters
977 ----------
978 collection : `str`
979 The name of an already-registered `~CollectionType.CALIBRATION`
980 collection.
981 datasetType : `str` or `DatasetType`
982 Name or `DatasetType` instance for the datasets to be decertified.
983 timespan : `Timespan`, optional
984 The validity range to remove datasets from within the collection.
985 Datasets that overlap this range but are not contained by it will
986 have their validity ranges adjusted to not overlap it, which may
987 split a single dataset validity range into two.
988 dataIds : `Iterable` [ `DataId` ], optional
989 Data IDs that should be decertified within the given validity range
990 If `None`, all data IDs for ``self.datasetType`` will be
991 decertified.
993 Raises
994 ------
995 TypeError
996 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
997 collection or if ``datasetType.isCalibration() is False``.
998 """
999 collectionRecord = self._collections.find(collection)
1000 if isinstance(datasetType, str):
1001 storage = self._datasets[datasetType]
1002 else:
1003 storage = self._datasets[datasetType.name]
1004 standardizedDataIds = None
1005 if dataIds is not None:
1006 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
1007 for d in dataIds]
1008 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
1010 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1011 """Return an object that allows a new `Datastore` instance to
1012 communicate with this `Registry`.
1014 Returns
1015 -------
1016 manager : `DatastoreRegistryBridgeManager`
1017 Object that mediates communication between this `Registry` and its
1018 associated datastores.
1019 """
1020 return self._datastoreBridges
1022 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1023 """Retrieve datastore locations for a given dataset.
1025 Parameters
1026 ----------
1027 ref : `DatasetRef`
1028 A reference to the dataset for which to retrieve storage
1029 information.
1031 Returns
1032 -------
1033 datastores : `Iterable` [ `str` ]
1034 All the matching datastores holding this dataset.
1036 Raises
1037 ------
1038 AmbiguousDatasetError
1039 Raised if ``ref.id`` is `None`.
1040 """
1041 return self._datastoreBridges.findDatastores(ref)
1043 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1044 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
1045 **kwargs: Any) -> DataCoordinate:
1046 """Expand a dimension-based data ID to include additional information.
1048 Parameters
1049 ----------
1050 dataId : `DataCoordinate` or `dict`, optional
1051 Data ID to be expanded; augmented and overridden by ``kwds``.
1052 graph : `DimensionGraph`, optional
1053 Set of dimensions for the expanded ID. If `None`, the dimensions
1054 will be inferred from the keys of ``dataId`` and ``kwds``.
1055 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1056 are silently ignored, providing a way to extract and expand a
1057 subset of a data ID.
1058 records : `Mapping` [`str`, `DimensionRecord`], optional
1059 Dimension record data to use before querying the database for that
1060 data, keyed by element name.
1061 **kwargs
1062 Additional keywords are treated like additional key-value pairs for
1063 ``dataId``, extending and overriding
1065 Returns
1066 -------
1067 expanded : `DataCoordinate`
1068 A data ID that includes full metadata for all of the dimensions it
1069 identifieds, i.e. guarantees that ``expanded.hasRecords()`` and
1070 ``expanded.hasFull()`` both return `True`.
1071 """
1072 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
1073 if standardized.hasRecords():
1074 return standardized
1075 if records is None:
1076 records = {}
1077 elif isinstance(records, NamedKeyMapping):
1078 records = records.byName()
1079 else:
1080 records = dict(records)
1081 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1082 records.update(dataId.records.byName())
1083 keys = standardized.byName()
1084 for element in standardized.graph.primaryKeyTraversalOrder:
1085 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1086 if record is ...:
1087 if isinstance(element, Dimension) and keys.get(element.name) is None:
1088 if element in standardized.graph.required:
1089 raise LookupError(
1090 f"No value or null value for required dimension {element.name}."
1091 )
1092 keys[element.name] = None
1093 record = None
1094 else:
1095 storage = self._dimensions[element]
1096 dataIdSet = DataCoordinateIterable.fromScalar(
1097 DataCoordinate.standardize(keys, graph=element.graph)
1098 )
1099 fetched = tuple(storage.fetch(dataIdSet))
1100 try:
1101 (record,) = fetched
1102 except ValueError:
1103 record = None
1104 records[element.name] = record
1105 if record is not None:
1106 for d in element.implied:
1107 value = getattr(record, d.name)
1108 if keys.setdefault(d.name, value) != value:
1109 raise InconsistentDataIdError(
1110 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1111 f"but {element.name} implies {d.name}={value!r}."
1112 )
1113 else:
1114 if element in standardized.graph.required:
1115 raise LookupError(
1116 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1117 )
1118 if element.alwaysJoin:
1119 raise InconsistentDataIdError(
1120 f"Could not fetch record for element {element.name} via keys {keys}, ",
1121 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1122 "related."
1123 )
1124 for d in element.implied:
1125 keys.setdefault(d.name, None)
1126 records.setdefault(d.name, None)
1127 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
1129 def insertDimensionData(self, element: Union[DimensionElement, str],
1130 *data: Union[Mapping[str, Any], DimensionRecord],
1131 conform: bool = True) -> None:
1132 """Insert one or more dimension records into the database.
1134 Parameters
1135 ----------
1136 element : `DimensionElement` or `str`
1137 The `DimensionElement` or name thereof that identifies the table
1138 records will be inserted into.
1139 data : `dict` or `DimensionRecord` (variadic)
1140 One or more records to insert.
1141 conform : `bool`, optional
1142 If `False` (`True` is default) perform no checking or conversions,
1143 and assume that ``element`` is a `DimensionElement` instance and
1144 ``data`` is a one or more `DimensionRecord` instances of the
1145 appropriate subclass.
1146 """
1147 if conform:
1148 if isinstance(element, str):
1149 element = self.dimensions[element]
1150 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1151 for row in data]
1152 else:
1153 # Ignore typing since caller said to trust them with conform=False.
1154 records = data # type: ignore
1155 storage = self._dimensions[element] # type: ignore
1156 storage.insert(*records)
1158 def syncDimensionData(self, element: Union[DimensionElement, str],
1159 row: Union[Mapping[str, Any], DimensionRecord],
1160 conform: bool = True) -> bool:
1161 """Synchronize the given dimension record with the database, inserting
1162 if it does not already exist and comparing values if it does.
1164 Parameters
1165 ----------
1166 element : `DimensionElement` or `str`
1167 The `DimensionElement` or name thereof that identifies the table
1168 records will be inserted into.
1169 row : `dict` or `DimensionRecord`
1170 The record to insert.
1171 conform : `bool`, optional
1172 If `False` (`True` is default) perform no checking or conversions,
1173 and assume that ``element`` is a `DimensionElement` instance and
1174 ``data`` is a one or more `DimensionRecord` instances of the
1175 appropriate subclass.
1177 Returns
1178 -------
1179 inserted : `bool`
1180 `True` if a new row was inserted, `False` otherwise.
1182 Raises
1183 ------
1184 ConflictingDefinitionError
1185 Raised if the record exists in the database (according to primary
1186 key lookup) but is inconsistent with the given one.
1187 """
1188 if conform:
1189 if isinstance(element, str):
1190 element = self.dimensions[element]
1191 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1192 else:
1193 # Ignore typing since caller said to trust them with conform=False.
1194 record = row # type: ignore
1195 storage = self._dimensions[element] # type: ignore
1196 return storage.sync(record)
1198 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1199 ) -> Iterator[DatasetType]:
1200 """Iterate over the dataset types whose names match an expression.
1202 Parameters
1203 ----------
1204 expression : `Any`, optional
1205 An expression that fully or partially identifies the dataset types
1206 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1207 `...` can be used to return all dataset types, and is the default.
1208 See :ref:`daf_butler_dataset_type_expressions` for more
1209 information.
1210 components : `bool`, optional
1211 If `True`, apply all expression patterns to component dataset type
1212 names as well. If `False`, never apply patterns to components.
1213 If `None` (default), apply patterns to components only if their
1214 parent datasets were not matched by the expression.
1215 Fully-specified component datasets (`str` or `DatasetType`
1216 instances) are always included.
1218 Yields
1219 ------
1220 datasetType : `DatasetType`
1221 A `DatasetType` instance whose name matches ``expression``.
1222 """
1223 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1224 if wildcard is Ellipsis:
1225 for datasetType in self._datasets:
1226 # The dataset type can no longer be a component
1227 yield datasetType
1228 if components:
1229 # Automatically create the component dataset types
1230 try:
1231 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
1232 except KeyError as err:
1233 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
1234 "if it has components they will not be included in query results.")
1235 else:
1236 yield from componentsForDatasetType
1237 return
1238 done: Set[str] = set()
1239 for name in wildcard.strings:
1240 storage = self._datasets.find(name)
1241 if storage is not None:
1242 done.add(storage.datasetType.name)
1243 yield storage.datasetType
1244 if wildcard.patterns:
1245 # If components (the argument) is None, we'll save component
1246 # dataset that we might want to match, but only if their parents
1247 # didn't get included.
1248 componentsForLater = []
1249 for registeredDatasetType in self._datasets:
1250 # Components are not stored in registry so expand them here
1251 allDatasetTypes = [registeredDatasetType]
1252 try:
1253 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
1254 except KeyError as err:
1255 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
1256 "if it has components they will not be included in query results.")
1257 for datasetType in allDatasetTypes:
1258 if datasetType.name in done:
1259 continue
1260 parentName, componentName = datasetType.nameAndComponent()
1261 if componentName is not None and not components:
1262 if components is None and parentName not in done:
1263 componentsForLater.append(datasetType)
1264 continue
1265 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1266 done.add(datasetType.name)
1267 yield datasetType
1268 # Go back and try to match saved components.
1269 for datasetType in componentsForLater:
1270 parentName, _ = datasetType.nameAndComponent()
1271 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1272 yield datasetType
1274 def queryCollections(self, expression: Any = ...,
1275 datasetType: Optional[DatasetType] = None,
1276 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1277 flattenChains: bool = False,
1278 includeChains: Optional[bool] = None) -> Iterator[str]:
1279 """Iterate over the collections whose names match an expression.
1281 Parameters
1282 ----------
1283 expression : `Any`, optional
1284 An expression that fully or partially identifies the collections
1285 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1286 `...` can be used to return all collections, and is the default.
1287 See :ref:`daf_butler_collection_expressions` for more
1288 information.
1289 datasetType : `DatasetType`, optional
1290 If provided, only yield collections that may contain datasets of
1291 this type. This is a conservative approximation in general; it may
1292 yield collections that do not have any such datasets.
1293 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1294 If provided, only yield collections of these types.
1295 flattenChains : `bool`, optional
1296 If `True` (`False` is default), recursively yield the child
1297 collections of matching `~CollectionType.CHAINED` collections.
1298 includeChains : `bool`, optional
1299 If `True`, yield records for matching `~CollectionType.CHAINED`
1300 collections. Default is the opposite of ``flattenChains``: include
1301 either CHAINED collections or their children, but not both.
1303 Yields
1304 ------
1305 collection : `str`
1306 The name of a collection that matches ``expression``.
1307 """
1308 # Right now the datasetTypes argument is completely ignored, but that
1309 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1310 # ticket will take care of that.
1311 query = CollectionQuery.fromExpression(expression)
1312 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes),
1313 flattenChains=flattenChains, includeChains=includeChains):
1314 yield record.name
1316 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
1317 """Return a `QueryBuilder` instance capable of constructing and
1318 managing more complex queries than those obtainable via `Registry`
1319 interfaces.
1321 This is an advanced interface; downstream code should prefer
1322 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1323 are sufficient.
1325 Parameters
1326 ----------
1327 summary : `queries.QuerySummary`
1328 Object describing and categorizing the full set of dimensions that
1329 will be included in the query.
1331 Returns
1332 -------
1333 builder : `queries.QueryBuilder`
1334 Object that can be used to construct and perform advanced queries.
1335 """
1336 return queries.QueryBuilder(
1337 summary,
1338 queries.RegistryManagers(
1339 collections=self._collections,
1340 dimensions=self._dimensions,
1341 datasets=self._datasets,
1342 TimespanReprClass=self._db.getTimespanRepresentation(),
1343 ),
1344 )
1346 def queryDatasets(self, datasetType: Any, *,
1347 collections: Any,
1348 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1349 dataId: Optional[DataId] = None,
1350 where: Optional[str] = None,
1351 findFirst: bool = False,
1352 components: Optional[bool] = None,
1353 bind: Optional[Mapping[str, Any]] = None,
1354 check: bool = True,
1355 **kwargs: Any) -> queries.DatasetQueryResults:
1356 """Query for and iterate over dataset references matching user-provided
1357 criteria.
1359 Parameters
1360 ----------
1361 datasetType
1362 An expression that fully or partially identifies the dataset types
1363 to be queried. Allowed types include `DatasetType`, `str`,
1364 `re.Pattern`, and iterables thereof. The special value `...` can
1365 be used to query all dataset types. See
1366 :ref:`daf_butler_dataset_type_expressions` for more information.
1367 collections
1368 An expression that fully or partially identifies the collections
1369 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1370 thereof. `...` can be used to find datasets from all
1371 `~CollectionType.RUN` collections (no other collections are
1372 necessary, because all datasets are in a ``RUN`` collection). See
1373 :ref:`daf_butler_collection_expressions` for more information.
1374 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1375 Dimensions to include in the query (in addition to those used
1376 to identify the queried dataset type(s)), either to constrain
1377 the resulting datasets to those for which a matching dimension
1378 exists, or to relate the dataset type's dimensions to dimensions
1379 referenced by the ``dataId`` or ``where`` arguments.
1380 dataId : `dict` or `DataCoordinate`, optional
1381 A data ID whose key-value pairs are used as equality constraints
1382 in the query.
1383 where : `str`, optional
1384 A string expression similar to a SQL WHERE clause. May involve
1385 any column of a dimension table or (as a shortcut for the primary
1386 key column of a dimension table) dimension name. See
1387 :ref:`daf_butler_dimension_expressions` for more information.
1388 findFirst : `bool`, optional
1389 If `True` (`False` is default), for each result data ID, only
1390 yield one `DatasetRef` of each `DatasetType`, from the first
1391 collection in which a dataset of that dataset type appears
1392 (according to the order of ``collections`` passed in). If `True`,
1393 ``collections`` must not contain regular expressions and may not
1394 be `...`.
1395 components : `bool`, optional
1396 If `True`, apply all dataset expression patterns to component
1397 dataset type names as well. If `False`, never apply patterns to
1398 components. If `None` (default), apply patterns to components only
1399 if their parent datasets were not matched by the expression.
1400 Fully-specified component datasets (`str` or `DatasetType`
1401 instances) are always included.
1402 bind : `Mapping`, optional
1403 Mapping containing literal values that should be injected into the
1404 ``where`` expression, keyed by the identifiers they replace.
1405 check : `bool`, optional
1406 If `True` (default) check the query for consistency before
1407 executing it. This may reject some valid queries that resemble
1408 common mistakes (e.g. queries for visits without specifying an
1409 instrument).
1410 **kwargs
1411 Additional keyword arguments are forwarded to
1412 `DataCoordinate.standardize` when processing the ``dataId``
1413 argument (and may be used to provide a constraining data ID even
1414 when the ``dataId`` argument is `None`).
1416 Returns
1417 -------
1418 refs : `queries.DatasetQueryResults`
1419 Dataset references matching the given query criteria.
1421 Raises
1422 ------
1423 TypeError
1424 Raised when the arguments are incompatible, such as when a
1425 collection wildcard is passed when ``findFirst`` is `True`.
1427 Notes
1428 -----
1429 When multiple dataset types are queried in a single call, the
1430 results of this operation are equivalent to querying for each dataset
1431 type separately in turn, and no information about the relationships
1432 between datasets of different types is included. In contexts where
1433 that kind of information is important, the recommended pattern is to
1434 use `queryDataIds` to first obtain data IDs (possibly with the
1435 desired dataset types and collections passed as constraints to the
1436 query), and then use multiple (generally much simpler) calls to
1437 `queryDatasets` with the returned data IDs passed as constraints.
1438 """
1439 # Standardize the collections expression.
1440 if findFirst:
1441 collections = CollectionSearch.fromExpression(collections)
1442 else:
1443 collections = CollectionQuery.fromExpression(collections)
1444 # Standardize and expand the data ID provided as a constraint.
1445 standardizedDataId = self.expandDataId(dataId, **kwargs)
1447 # We can only query directly if given a non-component DatasetType
1448 # instance. If we were given an expression or str or a component
1449 # DatasetType instance, we'll populate this dict, recurse, and return.
1450 # If we already have a non-component DatasetType, it will remain None
1451 # and we'll run the query directly.
1452 composition: Optional[
1453 Dict[
1454 DatasetType, # parent dataset type
1455 List[Optional[str]] # component name, or None for parent
1456 ]
1457 ] = None
1458 if not isinstance(datasetType, DatasetType):
1459 # We were given a dataset type expression (which may be as simple
1460 # as a str). Loop over all matching datasets, delegating handling
1461 # of the `components` argument to queryDatasetTypes, as we populate
1462 # the composition dict.
1463 composition = defaultdict(list)
1464 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1465 parentName, componentName = trueDatasetType.nameAndComponent()
1466 if componentName is not None:
1467 parentDatasetType = self.getDatasetType(parentName)
1468 composition.setdefault(parentDatasetType, []).append(componentName)
1469 else:
1470 composition.setdefault(trueDatasetType, []).append(None)
1471 elif datasetType.isComponent():
1472 # We were given a true DatasetType instance, but it's a component.
1473 # the composition dict will have exactly one item.
1474 parentName, componentName = datasetType.nameAndComponent()
1475 parentDatasetType = self.getDatasetType(parentName)
1476 composition = {parentDatasetType: [componentName]}
1477 if composition is not None:
1478 # We need to recurse. Do that once for each parent dataset type.
1479 chain = []
1480 for parentDatasetType, componentNames in composition.items():
1481 parentResults = self.queryDatasets(
1482 parentDatasetType,
1483 collections=collections,
1484 dimensions=dimensions,
1485 dataId=standardizedDataId,
1486 where=where,
1487 findFirst=findFirst,
1488 check=check,
1489 )
1490 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1491 chain.append(
1492 parentResults.withComponents(componentNames)
1493 )
1494 else:
1495 # Should only happen if we know there would be no results.
1496 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1497 and not parentResults._chain
1498 return queries.ChainedDatasetQueryResults(chain)
1499 # If we get here, there's no need to recurse (or we are already
1500 # recursing; there can only ever be one level of recursion).
1502 # The full set of dimensions in the query is the combination of those
1503 # needed for the DatasetType and those explicitly requested, if any.
1504 requestedDimensionNames = set(datasetType.dimensions.names)
1505 if dimensions is not None:
1506 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1507 # Construct the summary structure needed to construct a QueryBuilder.
1508 summary = queries.QuerySummary(
1509 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1510 dataId=standardizedDataId,
1511 expression=where,
1512 bind=bind,
1513 check=check,
1514 )
1515 builder = self.makeQueryBuilder(summary)
1516 # Add the dataset subquery to the query, telling the QueryBuilder to
1517 # include the rank of the selected collection in the results only if we
1518 # need to findFirst. Note that if any of the collections are
1519 # actually wildcard expressions, and we've asked for deduplication,
1520 # this will raise TypeError for us.
1521 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1522 return queries.ChainedDatasetQueryResults(())
1523 query = builder.finish()
1524 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
1526 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1527 dataId: Optional[DataId] = None,
1528 datasets: Any = None,
1529 collections: Any = None,
1530 where: Optional[str] = None,
1531 components: Optional[bool] = None,
1532 bind: Optional[Mapping[str, Any]] = None,
1533 check: bool = True,
1534 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1535 """Query for data IDs matching user-provided criteria.
1537 Parameters
1538 ----------
1539 dimensions : `Dimension` or `str`, or iterable thereof
1540 The dimensions of the data IDs to yield, as either `Dimension`
1541 instances or `str`. Will be automatically expanded to a complete
1542 `DimensionGraph`.
1543 dataId : `dict` or `DataCoordinate`, optional
1544 A data ID whose key-value pairs are used as equality constraints
1545 in the query.
1546 datasets : `Any`, optional
1547 An expression that fully or partially identifies dataset types
1548 that should constrain the yielded data IDs. For example, including
1549 "raw" here would constrain the yielded ``instrument``,
1550 ``exposure``, ``detector``, and ``physical_filter`` values to only
1551 those for which at least one "raw" dataset exists in
1552 ``collections``. Allowed types include `DatasetType`, `str`,
1553 `re.Pattern`, and iterables thereof. Unlike other dataset type
1554 expressions, ``...`` is not permitted - it doesn't make sense to
1555 constrain data IDs on the existence of *all* datasets.
1556 See :ref:`daf_butler_dataset_type_expressions` for more
1557 information.
1558 collections: `Any`, optional
1559 An expression that fully or partially identifies the collections
1560 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1561 thereof. `...` can be used to return all collections. Must be
1562 provided if ``datasets`` is, and is ignored if it is not. See
1563 :ref:`daf_butler_collection_expressions` for more information.
1564 where : `str`, optional
1565 A string expression similar to a SQL WHERE clause. May involve
1566 any column of a dimension table or (as a shortcut for the primary
1567 key column of a dimension table) dimension name. See
1568 :ref:`daf_butler_dimension_expressions` for more information.
1569 components : `bool`, optional
1570 If `True`, apply all dataset expression patterns to component
1571 dataset type names as well. If `False`, never apply patterns to
1572 components. If `None` (default), apply patterns to components only
1573 if their parent datasets were not matched by the expression.
1574 Fully-specified component datasets (`str` or `DatasetType`
1575 instances) are always included.
1576 bind : `Mapping`, optional
1577 Mapping containing literal values that should be injected into the
1578 ``where`` expression, keyed by the identifiers they replace.
1579 check : `bool`, optional
1580 If `True` (default) check the query for consistency before
1581 executing it. This may reject some valid queries that resemble
1582 common mistakes (e.g. queries for visits without specifying an
1583 instrument).
1584 **kwargs
1585 Additional keyword arguments are forwarded to
1586 `DataCoordinate.standardize` when processing the ``dataId``
1587 argument (and may be used to provide a constraining data ID even
1588 when the ``dataId`` argument is `None`).
1590 Returns
1591 -------
1592 dataIds : `DataCoordinateQueryResults`
1593 Data IDs matching the given query parameters. These are guaranteed
1594 to identify all dimensions (`DataCoordinate.hasFull` returns
1595 `True`), but will not contain `DimensionRecord` objects
1596 (`DataCoordinate.hasRecords` returns `False`). Call
1597 `DataCoordinateQueryResults.expanded` on the returned object to
1598 fetch those (and consider using
1599 `DataCoordinateQueryResults.materialize` on the returned object
1600 first if the expected number of rows is very large). See
1601 documentation for those methods for additional information.
1602 """
1603 dimensions = iterable(dimensions)
1604 standardizedDataId = self.expandDataId(dataId, **kwargs)
1605 standardizedDatasetTypes = set()
1606 requestedDimensions = self.dimensions.extract(dimensions)
1607 queryDimensionNames = set(requestedDimensions.names)
1608 if datasets is not None:
1609 if collections is None:
1610 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1611 for datasetType in self.queryDatasetTypes(datasets, components=components):
1612 queryDimensionNames.update(datasetType.dimensions.names)
1613 # If any matched dataset type is a component, just operate on
1614 # its parent instead, because Registry doesn't know anything
1615 # about what components exist, and here (unlike queryDatasets)
1616 # we don't care about returning them.
1617 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1618 if componentName is not None:
1619 datasetType = self.getDatasetType(parentDatasetTypeName)
1620 standardizedDatasetTypes.add(datasetType)
1621 # Preprocess collections expression in case the original included
1622 # single-pass iterators (we'll want to use it multiple times
1623 # below).
1624 collections = CollectionQuery.fromExpression(collections)
1626 summary = queries.QuerySummary(
1627 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1628 dataId=standardizedDataId,
1629 expression=where,
1630 bind=bind,
1631 check=check,
1632 )
1633 builder = self.makeQueryBuilder(summary)
1634 for datasetType in standardizedDatasetTypes:
1635 builder.joinDataset(datasetType, collections, isResult=False)
1636 query = builder.finish()
1637 return queries.DataCoordinateQueryResults(self._db, query)
1639 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1640 dataId: Optional[DataId] = None,
1641 datasets: Any = None,
1642 collections: Any = None,
1643 where: Optional[str] = None,
1644 components: Optional[bool] = None,
1645 bind: Optional[Mapping[str, Any]] = None,
1646 check: bool = True,
1647 **kwargs: Any) -> Iterator[DimensionRecord]:
1648 """Query for dimension information matching user-provided criteria.
1650 Parameters
1651 ----------
1652 element : `DimensionElement` or `str`
1653 The dimension element to obtain r
1654 dataId : `dict` or `DataCoordinate`, optional
1655 A data ID whose key-value pairs are used as equality constraints
1656 in the query.
1657 datasets : `Any`, optional
1658 An expression that fully or partially identifies dataset types
1659 that should constrain the yielded records. See `queryDataIds` and
1660 :ref:`daf_butler_dataset_type_expressions` for more information.
1661 collections: `Any`, optional
1662 An expression that fully or partially identifies the collections
1663 to search for datasets. See `queryDataIds` and
1664 :ref:`daf_butler_collection_expressions` for more information.
1665 where : `str`, optional
1666 A string expression similar to a SQL WHERE clause. See
1667 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1668 information.
1669 components : `bool`, optional
1670 Whether to apply dataset expressions to components as well.
1671 See `queryDataIds` for more information.
1672 bind : `Mapping`, optional
1673 Mapping containing literal values that should be injected into the
1674 ``where`` expression, keyed by the identifiers they replace.
1675 check : `bool`, optional
1676 If `True` (default) check the query for consistency before
1677 executing it. This may reject some valid queries that resemble
1678 common mistakes (e.g. queries for visits without specifying an
1679 instrument).
1680 **kwargs
1681 Additional keyword arguments are forwarded to
1682 `DataCoordinate.standardize` when processing the ``dataId``
1683 argument (and may be used to provide a constraining data ID even
1684 when the ``dataId`` argument is `None`).
1686 Returns
1687 -------
1688 dataIds : `DataCoordinateQueryResults`
1689 Data IDs matching the given query parameters.
1690 """
1691 if not isinstance(element, DimensionElement):
1692 element = self.dimensions[element]
1693 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1694 where=where, components=components, bind=bind, check=check, **kwargs)
1695 return iter(self._dimensions[element].fetch(dataIds))
1697 def queryDatasetAssociations(
1698 self,
1699 datasetType: Union[str, DatasetType],
1700 collections: Any = ...,
1701 *,
1702 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1703 flattenChains: bool = False,
1704 ) -> Iterator[DatasetAssociation]:
1705 """Iterate over dataset-collection combinations where the dataset is in
1706 the collection.
1708 This method is a temporary placeholder for better support for
1709 assocation results in `queryDatasets`. It will probably be
1710 removed in the future, and should be avoided in production code
1711 whenever possible.
1713 Parameters
1714 ----------
1715 datasetType : `DatasetType` or `str`
1716 A dataset type object or the name of one.
1717 collections: `Any`, optional
1718 An expression that fully or partially identifies the collections
1719 to search for datasets. See `queryCollections` and
1720 :ref:`daf_butler_collection_expressions` for more information.
1721 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1722 If provided, only yield associations from collections of these
1723 types.
1724 flattenChains : `bool`, optional
1725 If `True` (default) search in the children of
1726 `~CollectionType.CHAINED` collections. If `False`, ``CHAINED``
1727 collections are ignored.
1729 Yields
1730 ------
1731 association : `DatasetAssociation`
1732 Object representing the relationship beween a single dataset and
1733 a single collection.
1734 """
1735 collections = CollectionQuery.fromExpression(collections)
1736 TimespanReprClass = self._db.getTimespanRepresentation()
1737 if isinstance(datasetType, str):
1738 storage = self._datasets[datasetType]
1739 else:
1740 storage = self._datasets[datasetType.name]
1741 for collectionRecord in collections.iter(self._collections,
1742 collectionTypes=frozenset(collectionTypes),
1743 flattenChains=flattenChains):
1744 query = storage.select(collectionRecord)
1745 if query is None:
1746 continue
1747 for row in self._db.query(query.combine()):
1748 dataId = DataCoordinate.fromRequiredValues(
1749 storage.datasetType.dimensions,
1750 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1751 )
1752 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
1753 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1754 conform=False)
1755 if collectionRecord.type is CollectionType.CALIBRATION:
1756 timespan = TimespanReprClass.extract(row)
1757 else:
1758 timespan = None
1759 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1761 storageClasses: StorageClassFactory
1762 """All storage classes known to the registry (`StorageClassFactory`).
1763 """