# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "Registry",
)

from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

from ..core import (
    ButlerURI,
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
)
from . import queries
from ..core.utils import iterable, transactional
from ._config import RegistryConfig
from ._collectionType import CollectionType
from ._defaults import RegistryDefaults
from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
from .managers import RegistryManagerTypes, RegistryManagerInstances
from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
from .summaries import CollectionSummary
from .interfaces import ChainedCollectionRecord, RunRecord

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from .interfaces import (
        Database,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)

# key for dimensions configuration in attributes table
_DIMENSIONS_ATTR = "config:dimensions.json"


class Registry:
    """Registry interface.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        Struct containing the manager instances that back this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults.  Accessed within the ``configs``
    resource or relative to a search path.  Can be `None` if no defaults
    are specified.
    """

    @classmethod
    def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
                         dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
                         butlerRoot: Optional[str] = None) -> Registry:
        """Create registry database and return `Registry` instance.

        This method initializes database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, the default configuration
            is loaded from ``registry.yaml``.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, the default configuration
            is loaded from ``dimensions.yaml``.
        butlerRoot : `str`, optional
            Path to the repository root this `Registry` will manage.

        Returns
        -------
        registry : `Registry`
            A new `Registry` instance.
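
        Examples
        --------
        A minimal sketch, assuming an empty SQLite database; the connection
        string is illustrative::

            config = RegistryConfig()
            config["db"] = "sqlite:///example.sqlite3"
            registry = Registry.createFromConfig(config)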
        """
        if isinstance(config, str):
            config = RegistryConfig(config)
        elif config is None:
            config = RegistryConfig()
        elif not isinstance(config, RegistryConfig):
            raise TypeError(f"Incompatible Registry configuration type: {type(config)}")
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"))
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
                   butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
                   defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create `Registry` subclass instance from ``config``.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `str` or `ButlerURI`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the
            database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output
            `~CollectionType.RUN` collection.

        Returns
        -------
        registry : `Registry` (subclass)
            A new `Registry` subclass instance.
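
        Examples
        --------
        A minimal sketch, assuming an existing repository; the configuration
        path is illustrative::

            registry = Registry.fromConfig("registry.yaml", writeable=False)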
        """
        if not isinstance(config, RegistryConfig):
            if isinstance(config, (str, Config)):
                config = RegistryConfig(config)
            else:
                raise ValueError("Incompatible Registry configuration: {}".format(config))
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults.  This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"Registry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        """Return `True` if this registry allows write operations, and
        `False` otherwise.
        """
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create a new `Registry` backed by the same data repository and
        connection as this one, but with independent defaults.

        Parameters
        ----------
        defaults : `RegistryDefaults`, optional
            Default collections and data ID values for the new registry.  If
            not provided, ``self.defaults`` will be used (but future changes
            to either registry's defaults will not affect the other).

        Returns
        -------
        copy : `Registry`
            A new `Registry` instance with its own defaults.

        Notes
        -----
        Because the new registry shares a connection with the original, they
        also share transaction state (despite the fact that their
        `transaction` context manager methods do not reflect this), and must
        be used with care.
        """
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return Registry(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        """All dimensions recognized by this `Registry` (`DimensionUniverse`).
        """
        return self._managers.dimensions.universe

    @property
    def defaults(self) -> RegistryDefaults:
        """Default collection search path and/or output `~CollectionType.RUN`
        collection (`RegistryDefaults`).

        This is an immutable struct whose components may not be set
        individually, but the entire struct can be set by assigning to this
        property.
        """
        return self._defaults

    @defaults.setter
    def defaults(self, value: RegistryDefaults) -> None:
        if value.run is not None:
            self.registerRun(value.run)
        value.finish(self)
        self._defaults = value

    def refresh(self) -> None:
        """Refresh all in-memory state by querying the database.

        This may be necessary to enable querying for entities added by other
        `Registry` instances after this one was constructed.
        """
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        """Return a context manager that represents a transaction.
        """
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to.  Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset the SQLAlchemy connection pool for the registry database.

        This operation is useful when using registry with fork-based
        multiprocessing.  To use registry across a fork boundary, one must
        ensure that there are no currently active connections (no session or
        transaction in progress) and reset the connection pool using this
        method.  This method should be called by the child process
        immediately after the fork.
        """
        self._db._engine.dispose()

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore`
        or other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table.  This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
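
        Examples
        --------
        A minimal sketch of registering a table and inserting a row; the
        table name and columns are illustrative::

            spec = ddl.TableSpec(fields=[
                ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
            ])
            registry.registerOpaqueTable("my_records", spec)
            registry.insertOpaqueData("my_records", {"id": 1, "path": "a.fits"})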
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table.  Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table.  Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table.  Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
                           doc: Optional[str] = None) -> None:
        """Add a new collection if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the collection to create.
        type : `CollectionType`
            Enum value indicating the type of collection to create.
        doc : `str`, optional
            Documentation string for the collection.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        self._managers.collections.register(name, type, doc=doc)

    def getCollectionType(self, name: str) -> CollectionType:
        """Return an enumeration value indicating the type of the given
        collection.

        Parameters
        ----------
        name : `str`
            The name of the collection.

        Returns
        -------
        type : `CollectionType`
            Enum value indicating the type of this collection.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with the given name exists.
        """
        return self._managers.collections.find(name).type

    def registerRun(self, name: str, doc: Optional[str] = None) -> None:
        """Add a new run if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the run to create.
        doc : `str`, optional
            Documentation string for the collection.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        self._managers.collections.register(name, CollectionType.RUN, doc=doc)

    @transactional
    def removeCollection(self, name: str) -> None:
        """Completely remove the given collection.

        Parameters
        ----------
        name : `str`
            The name of the collection to remove.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with the given name exists.

        Notes
        -----
        If this is a `~CollectionType.RUN` collection, all datasets and
        quanta in it are also fully removed.  This requires that those
        datasets be removed (or at least trashed) from any datastores that
        hold them first.

        A collection may not be deleted as long as it is referenced by a
        `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
        be deleted or redefined first.
        """
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        """Return the child collections in a `~CollectionType.CHAINED`
        collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection.  Must have already been added via
            a call to `Registry.registerCollection`.

        Returns
        -------
        children : `CollectionSearch`
            An object that defines the search path of the collection.
            See :ref:`daf_butler_collection_expressions` for more
            information.

        Raises
        ------
        MissingCollectionError
            Raised if ``parent`` does not exist in the `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        """
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        """Define or redefine a `~CollectionType.CHAINED` collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection.  Must have already been added via
            a call to `Registry.registerCollection`.
        children : `Any`
            An expression defining an ordered search of child collections,
            generally an iterable of `str`; see
            :ref:`daf_butler_collection_expressions` for more information.
        flatten : `bool`, optional
            If `True` (`False` is default), recursively flatten out any
            nested `~CollectionType.CHAINED` collections in ``children``
            first.

        Raises
        ------
        MissingCollectionError
            Raised when any of the given collections do not exist in the
            `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        ValueError
            Raised if the given collections contain a cycle.
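
        Examples
        --------
        A minimal sketch; the collection names are illustrative and must
        already be registered::

            registry.registerCollection("my/chain", CollectionType.CHAINED)
            registry.setCollectionChain("my/chain", ["run/a", "run/b"])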
        """
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        """Retrieve the documentation string for a collection.

        Parameters
        ----------
        collection : `str`
            Name of the collection.

        Returns
        -------
        docs : `str` or `None`
            Docstring for the collection with the given name.
        """
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        """Set the documentation string for a collection.

        Parameters
        ----------
        collection : `str`
            Name of the collection.
        doc : `str` or `None`
            Docstring for the collection with the given name; will replace
            any existing docstring.  Passing `None` will remove any existing
            docstring.
        """
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `str`
            Name of the collection for which a summary is to be retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        """Add a new `DatasetType` to the Registry.

        It is not an error to register the same `DatasetType` twice.

        Parameters
        ----------
        datasetType : `DatasetType`
            The `DatasetType` to be added.

        Returns
        -------
        inserted : `bool`
            `True` if ``datasetType`` was inserted, `False` if an identical
            existing `DatasetType` was found.  Note that in either case the
            `DatasetType` is guaranteed to be defined in the Registry
            consistently with the given definition.

        Raises
        ------
        ValueError
            Raised if the dimensions or storage class are invalid.
        ConflictingDefinitionError
            Raised if this `DatasetType` is already registered with a
            different definition.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        """Remove the named `DatasetType` from the registry.

        .. warning::

            Registry caches the dataset type definitions.  This means that
            deleting the dataset type definition may result in unexpected
            behavior from other butler processes that are active and have
            not seen the deletion.

        Parameters
        ----------
        name : `str`
            Name of the type to be removed.

        Raises
        ------
        lsst.daf.butler.registry.OrphanedRecordError
            Raised if an attempt is made to remove the dataset type
            definition when there are already datasets associated with it.

        Notes
        -----
        If the dataset type is not registered the method will return without
        action.
        """
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        KeyError
            Raised if the requested named `DatasetType` could not be found
            in the registry.
        """
        return self._managers.datasets[name].datasetType

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any = None, timespan: Optional[Timespan] = None,
                    **kwargs: Any) -> Optional[DatasetRef]:
        """Find a dataset given its `DatasetType` and data ID.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`.  If the dataset is a component and can
        not be found using the provided dataset type, a dataset ref for the
        parent will be returned instead but with the correct dataset type.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that
            identify the dataset within a collection.
        collections : `Any`, optional
            An expression that fully or partially identifies the collections
            to search for the dataset; see
            :ref:`daf_butler_collection_expressions` for more information.
            Defaults to ``self.defaults.collections``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            If not provided, any `~CollectionType.CALIBRATION` collections
            matched by the ``collections`` argument will not be searched.
        **kwargs
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``dataId`` to a true
            `DataCoordinate` or augment an existing one.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        TypeError
            Raised if ``collections`` is `None` and
            ``self.defaults.collections`` is `None`.
        LookupError
            Raised if one or more data ID keys are missing.
        KeyError
            Raised if the dataset type does not exist.
        MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.

        Notes
        -----
        This method simply returns `None` and does not raise an exception
        even when the set of collections searched is intrinsically
        incompatible with the dataset type, e.g. if
        ``datasetType.isCalibration() is False``, but only
        `~CollectionType.CALIBRATION` collections are being searched.  This
        may make it harder to debug some lookup failures, but the behavior is
        intentional; we consider it more important that failed searches are
        reported consistently, regardless of the reason, and that adding
        additional collections that do not contain a match to the search path
        never changes the behavior.
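
        Examples
        --------
        A minimal sketch; the dataset type name, data ID keys, and collection
        are illustrative::

            ref = registry.findDataset("raw", instrument="Cam", exposure=100,
                                       detector=1, collections=["raw/run"])
            if ref is not None:
                print(ref.dataId)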
        """
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, defaults=self.defaults.dataId,
                                            **kwargs)
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to findDataset, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if (collectionRecord.type is CollectionType.CALIBRATION
                    and (not storage.datasetType.isCalibration() or timespan is None)):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result
        return None

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: Optional[str] = None) -> List[DatasetRef]:
        """Insert one or more datasets into the `Registry`.

        This always adds new datasets; to associate existing datasets with
        a new collection, use ``associate``.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
            Dimension-based identifiers for the new datasets.
        run : `str`, optional
            The name of the run that produced the datasets.  Defaults to
            ``self.defaults.run``.

        Returns
        -------
        refs : `list` of `DatasetRef`
            Resolved `DatasetRef` instances for all given data IDs (in the
            same order).

        Raises
        ------
        TypeError
            Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
        ConflictingDefinitionError
            Raised if a dataset with the same dataset type and data ID as one
            of those given already exists in ``run``.
        MissingCollectionError
            Raised if ``run`` does not exist in the registry.
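
        Examples
        --------
        A minimal sketch; the dataset type, data ID, and run are
        illustrative and must already be registered::

            (ref,) = registry.insertDatasets(
                "raw",
                [{"instrument": "Cam", "exposure": 100, "detector": 1}],
                run="raw/run",
            )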
        """
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to insertDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("lsst.daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                           for dataId in progress.wrap(dataIds,
                                                       f"Expanding {storage.datasetType.name} data IDs")]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: int) -> Optional[DatasetRef]:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `int`
            The unique identifier for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        """Remove datasets from the Registry.

        The datasets will be removed unconditionally from all collections,
        and any `Quantum` that consumed this dataset will instead be marked
        with having a NULL input.  `Datastore` records will *not* be deleted;
        the caller is responsible for ensuring that the dataset has already
        been removed from all Datastores.

        Parameters
        ----------
        refs : `Iterable` of `DatasetRef`
            References to the datasets to be removed.  Must include a valid
            ``id`` attribute, and should be considered invalidated upon
            return.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any ``ref.id`` is `None`.
        OrphanedRecordError
            Raised if any dataset is still present in any `Datastore`.
        """
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Removing datasets by type"):
            storage = self._managers.datasets.find(datasetType.name)
            assert storage is not None
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        """Add existing datasets to a `~CollectionType.TAGGED` collection.

        If a DatasetRef with the same exact integer ID is already in a
        collection nothing is changed.  If a `DatasetRef` with the same
        `DatasetType` and data ID but with different integer ID
        exists in the collection, `ConflictingDefinitionError` is raised.

        Parameters
        ----------
        collection : `str`
            Indicates the collection the datasets should be associated with.
        refs : `Iterable` [ `DatasetRef` ]
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.

        Raises
        ------
        ConflictingDefinitionError
            Raised if a Dataset with the given `DatasetRef` already exists
            in the given collection.
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if adding new datasets to the given ``collection`` is not
            allowed.
        """
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Associating datasets by type"):
            storage = self._managers.datasets.find(datasetType.name)
            assert storage is not None
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}.  This probably means that one or more datasets with the "
                    f"same dataset type and data ID already exist in the collection, but it may also "
                    f"indicate that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        """Remove existing datasets from a `~CollectionType.TAGGED`
        collection.

        ``collection`` and ``ref`` combinations that are not currently
        associated are silently ignored.

        Parameters
        ----------
        collection : `str`
            The collection the datasets should no longer be associated with.
        refs : `Iterable` [ `DatasetRef` ]
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given dataset references is unresolved.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if removing datasets from the given ``collection`` is not
            allowed.
        """
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Disassociating datasets by type"):
            storage = self._managers.datasets.find(datasetType.name)
            assert storage is not None
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `str`
            The name of an already-registered `~CollectionType.CALIBRATION`
            collection.
        refs : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        TypeError
            Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
            collection or if one or more datasets are of a dataset type for
            which `DatasetType.isCalibration` returns `False`.
        """
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Certifying datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataId]] = None) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `str`
            The name of an already-registered `~CollectionType.CALIBRATION`
            collection.
        datasetType : `str` or `DatasetType`
            Name or `DatasetType` instance for the datasets to be
            decertified.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataId` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for the given dataset type will
            be decertified.

        Raises
        ------
        TypeError
            Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
            collection or if ``datasetType.isCalibration() is False``.
        """
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
                                   for d in dataIds]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and
            its associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        """Retrieve datastore locations for a given dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            A reference to the dataset for which to retrieve storage
            information.

        Returns
        -------
        datastores : `Iterable` [ `str` ]
            All the matching datastores holding this dataset.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     withDefaults: bool = True,
                     **kwargs: Any) -> DataCoordinate:
        """Expand a dimension-based data ID to include additional information.

        Parameters
        ----------
        dataId : `DataCoordinate` or `dict`, optional
            Data ID to be expanded; augmented and overridden by ``kwargs``.
        graph : `DimensionGraph`, optional
            Set of dimensions for the expanded ID.  If `None`, the dimensions
            will be inferred from the keys of ``dataId`` and ``kwargs``.
            Dimensions that are in ``dataId`` or ``kwargs`` but not in
            ``graph`` are silently ignored, providing a way to extract and
            expand a subset of a data ID.
        records : `Mapping` [`str`, `DimensionRecord`], optional
            Dimension record data to use before querying the database for
            that data, keyed by element name.
        withDefaults : `bool`, optional
            Utilize ``self.defaults.dataId`` to fill in missing governor
            dimension key-value pairs.  Defaults to `True` (i.e. defaults are
            used).
        **kwargs
            Additional keywords are treated like additional key-value pairs
            for ``dataId``, extending and overriding it.

        Returns
        -------
        expanded : `DataCoordinate`
            A data ID that includes full metadata for all of the dimensions
            it identifies, i.e. guarantees that ``expanded.hasRecords()`` and
            ``expanded.hasFull()`` both return `True`.
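
        Examples
        --------
        A minimal sketch; the dimension names and values are illustrative::

            dataId = registry.expandDataId(instrument="Cam", exposure=100)
            assert dataId.hasRecords()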
        """
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
                                                  defaults=defaults, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True) -> None:
        """Insert one or more dimension records into the database.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            records will be inserted into.
        data : `dict` or `DimensionRecord` (variadic)
            One or more records to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or
            conversions, and assume that ``element`` is a `DimensionElement`
            instance and ``data`` is one or more `DimensionRecord` instances
            of the appropriate subclass.
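
        Examples
        --------
        A minimal sketch; the element name and record values are
        illustrative::

            registry.insertDimensionData("instrument",
                                         {"name": "Cam", "detector_max": 2})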
        """
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True) -> bool:
        """Synchronize the given dimension record with the database,
        inserting if it does not already exist and comparing values if it
        does.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            records will be inserted into.
        row : `dict` or `DimensionRecord`
            The record to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or
            conversions, and assume that ``element`` is a `DimensionElement`
            instance and ``row`` is a `DimensionRecord` instance of the
            appropriate subclass.

        Returns
        -------
        inserted : `bool`
            `True` if a new row was inserted, `False` otherwise.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the record exists in the database (according to primary
            key lookup) but is inconsistent with the given one.
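
        Examples
        --------
        A minimal sketch; syncing the same record twice returns `False` the
        second time (the values are illustrative)::

            inserted = registry.syncDimensionData("instrument", {"name": "Cam"})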
        """
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record)

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
                          ) -> Iterator[DatasetType]:
        """Iterate over the dataset types whose names match an expression.

        Parameters
        ----------
        expression : `Any`, optional
            An expression that fully or partially identifies the dataset
            types to return, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all dataset types, and is
            the default.  See :ref:`daf_butler_dataset_type_expressions` for
            more information.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well.  If `False`, never apply patterns to
            components.  If `None` (default), apply patterns to components
            only if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.

        Yields
        ------
        datasetType : `DatasetType`
            A `DatasetType` instance whose name matches ``expression``.
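
        Examples
        --------
        A minimal sketch using a regular expression; the pattern is
        illustrative::

            import re
            for datasetType in registry.queryDatasetTypes(re.compile("^raw")):
                print(datasetType.name)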
        """
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
                                     "if it has components they will not be included in query results.")
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            if storage is not None:
                done.add(storage.datasetType.name)
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # datasets that we might want to match, but only if their parents
            # didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
                                 "if it has components they will not be included in query results.")
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        """Iterate over the collections whose names match an expression.

        Parameters
        ----------
        expression : `Any`, optional
            An expression that fully or partially identifies the collections
            to return, such as a `str`, `re.Pattern`, or iterable thereof.
            `...` can be used to return all collections, and is the default.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        datasetType : `DatasetType`, optional
            If provided, only yield collections that may contain datasets of
            this type.  This is a conservative approximation in general; it
            may yield collections that do not have any such datasets.
        collectionTypes : `AbstractSet` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        flattenChains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections.  Default is the opposite of ``flattenChains``:
            include either CHAINED collections or their children, but not
            both.

        Yields
        ------
        collection : `str`
            The name of a collection that matches ``expression``.
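
        Examples
        --------
        A minimal sketch; the pattern is illustrative::

            import re
            for name in registry.queryCollections(re.compile("^u/"),
                                                  collectionTypes={CollectionType.RUN}):
                print(name)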
        """
        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees.  DM-24939 or a
        # follow-up ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions
            that will be included in the query.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced
            queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
        )

    def queryDatasets(self, datasetType: Any, *,
                      collections: Any = None,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      findFirst: bool = False,
                      components: Optional[bool] = None,
                      bind: Optional[Mapping[str, Any]] = None,
                      check: bool = True,
                      **kwargs: Any) -> queries.DatasetQueryResults:
        """Query for and iterate over dataset references matching
        user-provided criteria.

        Parameters
        ----------
        datasetType
            An expression that fully or partially identifies the dataset
            types to be queried.  Allowed types include `DatasetType`, `str`,
            `re.Pattern`, and iterables thereof.  The special value `...` can
            be used to query all dataset types.  See
            :ref:`daf_butler_dataset_type_expressions` for more information.
        collections : `Any`, optional
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or
            iterable thereof.  `...` can be used to find datasets from all
            `~CollectionType.RUN` collections (no other collections are
            necessary, because all datasets are in a ``RUN`` collection).
            See :ref:`daf_butler_collection_expressions` for more
            information.  If not provided, ``self.defaults.collections`` is
            used.
        dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
            Dimensions to include in the query (in addition to those used
            to identify the queried dataset type(s)), either to constrain
            the resulting datasets to those for which a matching dimension
            exists, or to relate the dataset type's dimensions to dimensions
            referenced by the ``dataId`` or ``where`` arguments.
        dataId : `dict` or `DataCoordinate`, optional
            A data ID whose key-value pairs are used as equality constraints
            in the query.
        where : `str`, optional
            A string expression similar to a SQL WHERE clause.  May involve
            any column of a dimension table or (as a shortcut for the primary
            key column of a dimension table) dimension name.  See
            :ref:`daf_butler_dimension_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (`False` is default), for each result data ID, only
            yield one `DatasetRef` of each `DatasetType`, from the first
            collection in which a dataset of that dataset type appears
            (according to the order of ``collections`` passed in).  If
            `True`, ``collections`` must not contain regular expressions and
            may not be `...`.
        components : `bool`, optional
            If `True`, apply all dataset expression patterns to component
            dataset type names as well.  If `False`, never apply patterns to
            components.  If `None` (default), apply patterns to components
            only if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        bind : `Mapping`, optional
            Mapping containing literal values that should be injected into
            the ``where`` expression, keyed by the identifiers they replace.
        check : `bool`, optional
            If `True` (default) check the query for consistency before
            executing it.  This may reject some valid queries that resemble
            common mistakes (e.g. queries for visits without specifying an
            instrument).
        **kwargs
            Additional keyword arguments are forwarded to
            `DataCoordinate.standardize` when processing the ``dataId``
            argument (and may be used to provide a constraining data ID even
            when the ``dataId`` argument is `None`).

        Returns
        -------
        refs : `queries.DatasetQueryResults`
            Dataset references matching the given query criteria.

        Raises
        ------
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``findFirst`` is `True`, or
            when ``collections`` is `None` and ``self.defaults.collections``
            is also `None`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included.  In contexts where
        that kind of information is important, the recommended pattern is to
        use `queryDataIds` to first obtain data IDs (possibly with the
        desired dataset types and collections passed as constraints to the
        query), and then use multiple (generally much simpler) calls to
        `queryDatasets` with the returned data IDs passed as constraints.
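
        Examples
        --------
        A minimal sketch; the dataset type, collection, and ``where`` clause
        are illustrative::

            refs = registry.queryDatasets(
                "raw",
                collections=["raw/run"],
                where="instrument='Cam' AND detector=1",
            )
            for ref in refs:
                print(ref.dataId)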
1476 """
1477 # Standardize the collections expression.
1478 if collections is None:
1479 if not self.defaults.collections:
1480 raise TypeError("No collections provided to findDataset, "
1481 "and no defaults from registry construction.")
1482 collections = self.defaults.collections
1483 elif findFirst:
1484 collections = CollectionSearch.fromExpression(collections)
1485 else:
1486 collections = CollectionQuery.fromExpression(collections)
1487 # Standardize and expand the data ID provided as a constraint.
1488 standardizedDataId = self.expandDataId(dataId, **kwargs)
1490 # We can only query directly if given a non-component DatasetType
1491 # instance. If we were given an expression or str or a component
1492 # DatasetType instance, we'll populate this dict, recurse, and return.
1493 # If we already have a non-component DatasetType, it will remain None
1494 # and we'll run the query directly.
1495 composition: Optional[
1496 Dict[
1497 DatasetType, # parent dataset type
1498 List[Optional[str]] # component name, or None for parent
1499 ]
1500 ] = None
1501 if not isinstance(datasetType, DatasetType):
1502 # We were given a dataset type expression (which may be as simple
1503 # as a str). Loop over all matching datasets, delegating handling
1504 # of the `components` argument to queryDatasetTypes, as we populate
1505 # the composition dict.
1506 composition = defaultdict(list)
1507 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1508 parentName, componentName = trueDatasetType.nameAndComponent()
1509 if componentName is not None:
1510 parentDatasetType = self.getDatasetType(parentName)
1511 composition[parentDatasetType].append(componentName)
1512 else:
1513 composition[trueDatasetType].append(None)
1514 elif datasetType.isComponent():
1515 # We were given a true DatasetType instance, but it's a component.
1516 # The composition dict will have exactly one item.
1517 parentName, componentName = datasetType.nameAndComponent()
1518 parentDatasetType = self.getDatasetType(parentName)
1519 composition = {parentDatasetType: [componentName]}
1520 if composition is not None:
1521 # We need to recurse. Do that once for each parent dataset type.
1522 chain = []
1523 for parentDatasetType, componentNames in composition.items():
1524 parentResults = self.queryDatasets(
1525 parentDatasetType,
1526 collections=collections,
1527 dimensions=dimensions,
1528 dataId=standardizedDataId,
1529 where=where,
bind=bind,  # forward bound parameters used by the where expression
1530 findFirst=findFirst,
1531 check=check,
1532 )
1533 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1534 chain.append(
1535 parentResults.withComponents(componentNames)
1536 )
1537 else:
1538 # Should only happen if we know there would be no results.
1539 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1540 and not parentResults._chain
1541 return queries.ChainedDatasetQueryResults(chain)
1542 # If we get here, there's no need to recurse (or we are already
1543 # recursing; there can only ever be one level of recursion).
1545 # The full set of dimensions in the query is the combination of those
1546 # needed for the DatasetType and those explicitly requested, if any.
1547 requestedDimensionNames = set(datasetType.dimensions.names)
1548 if dimensions is not None:
1549 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1550 # Construct the summary structure needed to construct a QueryBuilder.
1551 summary = queries.QuerySummary(
1552 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1553 dataId=standardizedDataId,
1554 expression=where,
1555 bind=bind,
1556 defaults=self.defaults.dataId,
1557 check=check,
1558 )
1559 builder = self.makeQueryBuilder(summary)
1560 # Add the dataset subquery to the query, telling the QueryBuilder to
1561 # include the rank of the selected collection in the results only if we
1562 # need to findFirst. Note that if any of the collections are
1563 # actually wildcard expressions, and we've asked for a find-first
1564 # search, this will raise TypeError for us.
1565 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1566 return queries.ChainedDatasetQueryResults(())
1567 query = builder.finish()
1568 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
1570 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1571 dataId: Optional[DataId] = None,
1572 datasets: Any = None,
1573 collections: Any = None,
1574 where: Optional[str] = None,
1575 components: Optional[bool] = None,
1576 bind: Optional[Mapping[str, Any]] = None,
1577 check: bool = True,
1578 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1579 """Query for data IDs matching user-provided criteria.
1581 Parameters
1582 ----------
1583 dimensions : `Dimension` or `str`, or iterable thereof
1584 The dimensions of the data IDs to yield, as either `Dimension`
1585 instances or `str`. Will be automatically expanded to a complete
1586 `DimensionGraph`.
1587 dataId : `dict` or `DataCoordinate`, optional
1588 A data ID whose key-value pairs are used as equality constraints
1589 in the query.
1590 datasets : `Any`, optional
1591 An expression that fully or partially identifies dataset types
1592 that should constrain the yielded data IDs. For example, including
1593 "raw" here would constrain the yielded ``instrument``,
1594 ``exposure``, ``detector``, and ``physical_filter`` values to only
1595 those for which at least one "raw" dataset exists in
1596 ``collections``. Allowed types include `DatasetType`, `str`,
1597 `re.Pattern`, and iterables thereof. Unlike other dataset type
1598 expressions, ``...`` is not permitted - it doesn't make sense to
1599 constrain data IDs on the existence of *all* datasets.
1600 See :ref:`daf_butler_dataset_type_expressions` for more
1601 information.
1602 collections : `Any`, optional
1603 An expression that fully or partially identifies the collections
1604 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1605 thereof. `...` can be used to return all collections. Must be
1606 provided if ``datasets`` is, and is ignored if it is not. See
1607 :ref:`daf_butler_collection_expressions` for more information.
1608 If not provided, ``self.defaults.collections`` is used.
1609 where : `str`, optional
1610 A string expression similar to a SQL WHERE clause. May involve
1611 any column of a dimension table or (as a shortcut for the primary
1612 key column of a dimension table) dimension name. See
1613 :ref:`daf_butler_dimension_expressions` for more information.
1614 components : `bool`, optional
1615 If `True`, apply all dataset expression patterns to component
1616 dataset type names as well. If `False`, never apply patterns to
1617 components. If `None` (default), apply patterns to components only
1618 if their parent datasets were not matched by the expression.
1619 Fully-specified component datasets (`str` or `DatasetType`
1620 instances) are always included.
1621 bind : `Mapping`, optional
1622 Mapping containing literal values that should be injected into the
1623 ``where`` expression, keyed by the identifiers they replace.
1624 check : `bool`, optional
1625 If `True` (default) check the query for consistency before
1626 executing it. This may reject some valid queries that resemble
1627 common mistakes (e.g. queries for visits without specifying an
1628 instrument).
1629 **kwargs
1630 Additional keyword arguments are forwarded to
1631 `DataCoordinate.standardize` when processing the ``dataId``
1632 argument (and may be used to provide a constraining data ID even
1633 when the ``dataId`` argument is `None`).
1635 Returns
1636 -------
1637 dataIds : `DataCoordinateQueryResults`
1638 Data IDs matching the given query parameters. These are guaranteed
1639 to identify all dimensions (`DataCoordinate.hasFull` returns
1640 `True`), but will not contain `DimensionRecord` objects
1641 (`DataCoordinate.hasRecords` returns `False`). Call
1642 `DataCoordinateQueryResults.expanded` on the returned object to
1643 fetch those (and consider using
1644 `DataCoordinateQueryResults.materialize` on the returned object
1645 first if the expected number of rows is very large). See
1646 documentation for those methods for additional information.
1648 Raises
1649 ------
1650 TypeError
1651 Raised if ``collections`` is `None`, ``self.defaults.collections``
1652 is `None`, and ``datasets`` is not `None`.
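Examples
--------
An illustrative sketch; the instrument name and visit cutoff are
hypothetical:

>>> # Constrain visits with a ``where`` expression, injecting the
>>> # literal values through ``bind`` rather than embedding them.
>>> dataIds = registry.queryDataIds(
...     ["visit"],
...     where="instrument = instr AND visit > cutoff",
...     bind={"instr": "HSC", "cutoff": 900},
... )
>>> dataIds = dataIds.expanded()  # attach DimensionRecords to each ID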
1653 """
1654 dimensions = iterable(dimensions)
1655 standardizedDataId = self.expandDataId(dataId, **kwargs)
1656 standardizedDatasetTypes = set()
1657 requestedDimensions = self.dimensions.extract(dimensions)
1658 queryDimensionNames = set(requestedDimensions.names)
1659 if datasets is not None:
1660 if collections is None:
1661 if not self.defaults.collections:
1662 raise TypeError("Cannot pass 'datasets' without 'collections' when no default collections are set.")
1663 collections = self.defaults.collections
1664 else:
1665 # Preprocess collections expression in case the original
1666 # included single-pass iterators (we'll want to use it multiple
1667 # times below).
1668 collections = CollectionQuery.fromExpression(collections)
1669 for datasetType in self.queryDatasetTypes(datasets, components=components):
1670 queryDimensionNames.update(datasetType.dimensions.names)
1671 # If any matched dataset type is a component, just operate on
1672 # its parent instead, because Registry doesn't know anything
1673 # about what components exist, and here (unlike queryDatasets)
1674 # we don't care about returning them.
1675 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1676 if componentName is not None:
1677 datasetType = self.getDatasetType(parentDatasetTypeName)
1678 standardizedDatasetTypes.add(datasetType)
1680 summary = queries.QuerySummary(
1681 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1682 dataId=standardizedDataId,
1683 expression=where,
1684 bind=bind,
1685 defaults=self.defaults.dataId,
1686 check=check,
1687 )
1688 builder = self.makeQueryBuilder(summary)
1689 for datasetType in standardizedDatasetTypes:
1690 builder.joinDataset(datasetType, collections, isResult=False)
1691 query = builder.finish()
1692 return queries.DataCoordinateQueryResults(self._db, query)
1694 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1695 dataId: Optional[DataId] = None,
1696 datasets: Any = None,
1697 collections: Any = None,
1698 where: Optional[str] = None,
1699 components: Optional[bool] = None,
1700 bind: Optional[Mapping[str, Any]] = None,
1701 check: bool = True,
1702 **kwargs: Any) -> Iterator[DimensionRecord]:
1703 """Query for dimension information matching user-provided criteria.
1705 Parameters
1706 ----------
1707 element : `DimensionElement` or `str`
1708 The dimension element to obtain records for.
1709 dataId : `dict` or `DataCoordinate`, optional
1710 A data ID whose key-value pairs are used as equality constraints
1711 in the query.
1712 datasets : `Any`, optional
1713 An expression that fully or partially identifies dataset types
1714 that should constrain the yielded records. See `queryDataIds` and
1715 :ref:`daf_butler_dataset_type_expressions` for more information.
1716 collections : `Any`, optional
1717 An expression that fully or partially identifies the collections
1718 to search for datasets. See `queryDataIds` and
1719 :ref:`daf_butler_collection_expressions` for more information.
1720 where : `str`, optional
1721 A string expression similar to a SQL WHERE clause. See
1722 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1723 information.
1724 components : `bool`, optional
1725 Whether to apply dataset expressions to components as well.
1726 See `queryDataIds` for more information.
1727 bind : `Mapping`, optional
1728 Mapping containing literal values that should be injected into the
1729 ``where`` expression, keyed by the identifiers they replace.
1730 check : `bool`, optional
1731 If `True` (default) check the query for consistency before
1732 executing it. This may reject some valid queries that resemble
1733 common mistakes (e.g. queries for visits without specifying an
1734 instrument).
1735 **kwargs
1736 Additional keyword arguments are forwarded to
1737 `DataCoordinate.standardize` when processing the ``dataId``
1738 argument (and may be used to provide a constraining data ID even
1739 when the ``dataId`` argument is `None`).
1741 Returns
1742 -------
1743 dimensionRecords : `Iterator` [ `DimensionRecord` ]
1744 Dimension records matching the given query parameters.
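Examples
--------
An illustrative sketch; the instrument name is hypothetical:

>>> # Fetch the detector records for one instrument, passing the
>>> # constraining data ID as a keyword argument.
>>> for record in registry.queryDimensionRecords(
...         "detector", instrument="HSC"):
...     print(record)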
1745 """
1746 if not isinstance(element, DimensionElement):
1747 try:
1748 element = self.dimensions[element]
1749 except KeyError as e:
1750 raise KeyError(f"No such dimension '{element}', available dimensions: "
1751 + str(self.dimensions.getStaticElements())) from e
1752 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1753 where=where, components=components, bind=bind, check=check, **kwargs)
1754 return iter(self._managers.dimensions[element].fetch(dataIds))
1756 def queryDatasetAssociations(
1757 self,
1758 datasetType: Union[str, DatasetType],
1759 collections: Any = ...,
1760 *,
1761 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1762 flattenChains: bool = False,
1763 ) -> Iterator[DatasetAssociation]:
1764 """Iterate over dataset-collection combinations where the dataset is in
1765 the collection.
1767 This method is a temporary placeholder for better support for
1768 association results in `queryDatasets`. It will probably be
1769 removed in the future, and should be avoided in production code
1770 whenever possible.
1772 Parameters
1773 ----------
1774 datasetType : `DatasetType` or `str`
1775 A dataset type object or the name of one.
1776 collections : `Any`, optional
1777 An expression that fully or partially identifies the collections
1778 to search for datasets. See `queryCollections` and
1779 :ref:`daf_butler_collection_expressions` for more information.
1780 The default, ``...``, searches all collections; if `None` is
passed, ``self.defaults.collections`` is used instead.
1781 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1782 If provided, only yield associations from collections of these
1783 types.
1784 flattenChains : `bool`, optional
1785 If `True`, search in the children of
1786 `~CollectionType.CHAINED` collections. If `False` (default),
1787 ``CHAINED`` collections are ignored.
1789 Yields
1790 ------
1791 association : `DatasetAssociation`
1792 Object representing the relationship between a single dataset and
1793 a single collection.
1795 Raises
1796 ------
1797 TypeError
1798 Raised if ``collections`` is `None` and
1799 ``self.defaults.collections`` is `None`.
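Examples
--------
An illustrative sketch; the dataset type name is hypothetical:

>>> # List every collection membership of "bias" datasets; in
>>> # CALIBRATION collections the association carries a timespan.
>>> for assoc in registry.queryDatasetAssociations(
...         "bias", flattenChains=True):
...     print(assoc.collection, assoc.ref, assoc.timespan)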
1800 """
1801 if collections is None:
1802 if not self.defaults.collections:
1803 raise TypeError("No collections provided to queryDatasetAssociations, "
1804 "and no defaults from registry construction.")
1805 collections = self.defaults.collections
1806 else:
1807 collections = CollectionQuery.fromExpression(collections)
1808 TimespanReprClass = self._db.getTimespanRepresentation()
1809 if isinstance(datasetType, str):
1810 storage = self._managers.datasets[datasetType]
1811 else:
1812 storage = self._managers.datasets[datasetType.name]
1813 for collectionRecord in collections.iter(self._managers.collections,
1814 collectionTypes=frozenset(collectionTypes),
1815 flattenChains=flattenChains):
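# Build a subquery for this collection; ``select`` may return
# `None` (e.g. when the collection has no datasets of this type),
# in which case the collection is skipped.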
1816 query = storage.select(collectionRecord)
1817 if query is None:
1818 continue
1819 for row in self._db.query(query.combine()):
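# Reconstruct the data ID and `DatasetRef` from the row, looking
# up the RUN collection that actually owns the dataset.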
1820 dataId = DataCoordinate.fromRequiredValues(
1821 storage.datasetType.dimensions,
1822 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1823 )
1824 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1825 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1826 conform=False)
1827 if collectionRecord.type is CollectionType.CALIBRATION:
1828 timespan = TimespanReprClass.extract(row)
1829 else:
1830 timespan = None
1831 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1833 storageClasses: StorageClassFactory
1834 """All storage classes known to the registry (`StorageClassFactory`).
1835 """