Coverage for python/lsst/daf/butler/registry/sql_registry.py: 18%
575 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-05 11:07 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-05 11:07 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from .. import ddl
32__all__ = ("SqlRegistry",)
34import contextlib
35import logging
36import warnings
37from collections.abc import Iterable, Iterator, Mapping, Sequence
38from typing import TYPE_CHECKING, Any, Literal, cast
40import sqlalchemy
41from lsst.daf.relation import LeafRelation, Relation
42from lsst.resources import ResourcePathExpression
43from lsst.utils.introspection import find_outside_stacklevel
44from lsst.utils.iteration import ensure_iterable
46from .._column_tags import DatasetColumnTag
47from .._config import Config
48from .._dataset_association import DatasetAssociation
49from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
50from .._dataset_type import DatasetType
51from .._named import NamedKeyMapping, NameLookupMapping
52from .._storage_class import StorageClassFactory
53from .._timespan import Timespan
54from ..dimensions import (
55 DataCoordinate,
56 DataId,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionGroup,
62 DimensionRecord,
63 DimensionUniverse,
64)
65from ..progress import Progress
66from ..registry import (
67 ArgumentError,
68 CollectionExpressionError,
69 CollectionSummary,
70 CollectionType,
71 CollectionTypeError,
72 ConflictingDefinitionError,
73 DataIdValueError,
74 DatasetTypeError,
75 DimensionNameError,
76 InconsistentDataIdError,
77 NoDefaultCollectionError,
78 OrphanedRecordError,
79 RegistryConfig,
80 RegistryConsistencyError,
81 RegistryDefaults,
82 queries,
83)
84from ..registry.interfaces import ChainedCollectionRecord, ReadOnlyDatabaseError, RunRecord
85from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
86from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
87from ..utils import transactional
89if TYPE_CHECKING:
90 from .._butler_config import ButlerConfig
91 from ..datastore._datastore import DatastoreOpaqueTable
92 from ..datastore.stored_file_info import StoredDatastoreItemInfo
93 from ..registry._registry import CollectionArgType
94 from ..registry.interfaces import (
95 CollectionRecord,
96 Database,
97 DatastoreRegistryBridgeManager,
98 ObsCoreTableManager,
99 )
102_LOG = logging.getLogger(__name__)
105class SqlRegistry:
106 """Butler Registry implementation that uses SQL database as backend.
108 Parameters
109 ----------
110 database : `Database`
111 Database instance to store Registry.
112 defaults : `RegistryDefaults`
113 Default collection search path and/or output `~CollectionType.RUN`
114 collection.
115 managers : `RegistryManagerInstances`
116 All the managers required for this registry.
117 """
119 defaultConfigFile: str | None = None
120 """Path to configuration defaults. Accessed within the ``configs`` resource
121 or relative to a search path. Can be None if no defaults specified.
122 """
124 @classmethod
125 def forceRegistryConfig(
126 cls, config: ButlerConfig | RegistryConfig | Config | str | None
127 ) -> RegistryConfig:
128 """Force the supplied config to a `RegistryConfig`.
130 Parameters
131 ----------
132 config : `RegistryConfig`, `Config` or `str` or `None`
133 Registry configuration, if missing then default configuration will
134 be loaded from registry.yaml.
136 Returns
137 -------
138 registry_config : `RegistryConfig`
139 A registry config.
140 """
141 if not isinstance(config, RegistryConfig):
142 if isinstance(config, str | Config) or config is None:
143 config = RegistryConfig(config)
144 else:
145 raise ValueError(f"Incompatible Registry configuration: {config}")
146 return config
148 @classmethod
149 def createFromConfig(
150 cls,
151 config: RegistryConfig | str | None = None,
152 dimensionConfig: DimensionConfig | str | None = None,
153 butlerRoot: ResourcePathExpression | None = None,
154 ) -> SqlRegistry:
155 """Create registry database and return `SqlRegistry` instance.
157 This method initializes database contents, database must be empty
158 prior to calling this method.
160 Parameters
161 ----------
162 config : `RegistryConfig` or `str`, optional
163 Registry configuration, if missing then default configuration will
164 be loaded from registry.yaml.
165 dimensionConfig : `DimensionConfig` or `str`, optional
166 Dimensions configuration, if missing then default configuration
167 will be loaded from dimensions.yaml.
168 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
169 Path to the repository root this `SqlRegistry` will manage.
171 Returns
172 -------
173 registry : `SqlRegistry`
174 A new `SqlRegistry` instance.
175 """
176 config = cls.forceRegistryConfig(config)
177 config.replaceRoot(butlerRoot)
179 if isinstance(dimensionConfig, str):
180 dimensionConfig = DimensionConfig(dimensionConfig)
181 elif dimensionConfig is None:
182 dimensionConfig = DimensionConfig()
183 elif not isinstance(dimensionConfig, DimensionConfig):
184 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
186 DatabaseClass = config.getDatabaseClass()
187 database = DatabaseClass.fromUri(
188 config.connectionString, origin=config.get("origin", 0), namespace=config.get("namespace")
189 )
190 managerTypes = RegistryManagerTypes.fromConfig(config)
191 managers = managerTypes.makeRepo(database, dimensionConfig)
192 return cls(database, RegistryDefaults(), managers)
194 @classmethod
195 def fromConfig(
196 cls,
197 config: ButlerConfig | RegistryConfig | Config | str,
198 butlerRoot: ResourcePathExpression | None = None,
199 writeable: bool = True,
200 defaults: RegistryDefaults | None = None,
201 ) -> SqlRegistry:
202 """Create `Registry` subclass instance from `config`.
204 Registry database must be initialized prior to calling this method.
206 Parameters
207 ----------
208 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
209 Registry configuration
210 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
211 Path to the repository root this `Registry` will manage.
212 writeable : `bool`, optional
213 If `True` (default) create a read-write connection to the database.
214 defaults : `RegistryDefaults`, optional
215 Default collection search path and/or output `~CollectionType.RUN`
216 collection.
218 Returns
219 -------
220 registry : `SqlRegistry`
221 A new `SqlRegistry` subclass instance.
222 """
223 config = cls.forceRegistryConfig(config)
224 config.replaceRoot(butlerRoot)
225 DatabaseClass = config.getDatabaseClass()
226 database = DatabaseClass.fromUri(
227 config.connectionString,
228 origin=config.get("origin", 0),
229 namespace=config.get("namespace"),
230 writeable=writeable,
231 )
232 managerTypes = RegistryManagerTypes.fromConfig(config)
233 with database.session():
234 managers = managerTypes.loadRepo(database)
235 if defaults is None:
236 defaults = RegistryDefaults()
237 return cls(database, defaults, managers)
239 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
240 self._db = database
241 self._managers = managers
242 self.storageClasses = StorageClassFactory()
243 # Intentionally invoke property setter to initialize defaults. This
244 # can only be done after most of the rest of Registry has already been
245 # initialized, and must be done before the property getter is used.
246 self.defaults = defaults
248 # TODO: This is currently initialized by `make_datastore_tables`,
249 # eventually we'll need to do it during construction.
250 # The mapping is indexed by the opaque table name.
251 self._datastore_record_classes: Mapping[str, type[StoredDatastoreItemInfo]] = {}
253 def __str__(self) -> str:
254 return str(self._db)
256 def __repr__(self) -> str:
257 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
259 def isWriteable(self) -> bool:
260 """Return `True` if this registry allows write operations, and `False`
261 otherwise.
262 """
263 return self._db.isWriteable()
265 def copy(self, defaults: RegistryDefaults | None = None) -> SqlRegistry:
266 """Create a new `SqlRegistry` backed by the same data repository
267 and connection as this one, but independent defaults.
269 Parameters
270 ----------
271 defaults : `~lsst.daf.butler.registry.RegistryDefaults`, optional
272 Default collections and data ID values for the new registry. If
273 not provided, ``self.defaults`` will be used (but future changes
274 to either registry's defaults will not affect the other).
276 Returns
277 -------
278 copy : `SqlRegistry`
279 A new `SqlRegistry` instance with its own defaults.
281 Notes
282 -----
283 Because the new registry shares a connection with the original, they
284 also share transaction state (despite the fact that their `transaction`
285 context manager methods do not reflect this), and must be used with
286 care.
287 """
288 if defaults is None:
289 # No need to copy, because `RegistryDefaults` is immutable; we
290 # effectively copy on write.
291 defaults = self.defaults
292 return type(self)(self._db, defaults, self._managers)
294 @property
295 def dimensions(self) -> DimensionUniverse:
296 """Definitions of all dimensions recognized by this `Registry`
297 (`DimensionUniverse`).
298 """
299 return self._managers.dimensions.universe
301 @property
302 def defaults(self) -> RegistryDefaults:
303 """Default collection search path and/or output `~CollectionType.RUN`
304 collection (`~lsst.daf.butler.registry.RegistryDefaults`).
306 This is an immutable struct whose components may not be set
307 individually, but the entire struct can be set by assigning to this
308 property.
309 """
310 return self._defaults
312 @defaults.setter
313 def defaults(self, value: RegistryDefaults) -> None:
314 if value.run is not None:
315 self.registerRun(value.run)
316 value.finish(self)
317 self._defaults = value
319 def refresh(self) -> None:
320 """Refresh all in-memory state by querying the database.
322 This may be necessary to enable querying for entities added by other
323 registry instances after this one was constructed.
324 """
325 with self._db.transaction():
326 self._managers.refresh()
328 @contextlib.contextmanager
329 def caching_context(self) -> Iterator[None]:
330 """Context manager that enables caching."""
331 self._managers.caching_context.enable()
332 yield
333 self._managers.caching_context.disable()
335 @contextlib.contextmanager
336 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
337 """Return a context manager that represents a transaction."""
338 try:
339 with self._db.transaction(savepoint=savepoint):
340 yield
341 except BaseException:
342 # TODO: this clears the caches sometimes when we wouldn't actually
343 # need to. Can we avoid that?
344 self._managers.dimensions.clearCaches()
345 raise
347 def resetConnectionPool(self) -> None:
348 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
350 This operation is useful when using registry with fork-based
351 multiprocessing. To use registry across fork boundary one has to make
352 sure that there are no currently active connections (no session or
353 transaction is in progress) and connection pool is reset using this
354 method. This method should be called by the child process immediately
355 after the fork.
356 """
357 self._db._engine.dispose()
359 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
360 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
361 other data repository client.
363 Opaque table records can be added via `insertOpaqueData`, retrieved via
364 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
366 Parameters
367 ----------
368 tableName : `str`
369 Logical name of the opaque table. This may differ from the
370 actual name used in the database by a prefix and/or suffix.
371 spec : `ddl.TableSpec`
372 Specification for the table to be added.
373 """
374 self._managers.opaque.register(tableName, spec)
376 @transactional
377 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
378 """Insert records into an opaque table.
380 Parameters
381 ----------
382 tableName : `str`
383 Logical name of the opaque table. Must match the name used in a
384 previous call to `registerOpaqueTable`.
385 data
386 Each additional positional argument is a dictionary that represents
387 a single row to be added.
388 """
389 self._managers.opaque[tableName].insert(*data)
391 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[Mapping[str, Any]]:
392 """Retrieve records from an opaque table.
394 Parameters
395 ----------
396 tableName : `str`
397 Logical name of the opaque table. Must match the name used in a
398 previous call to `registerOpaqueTable`.
399 where
400 Additional keyword arguments are interpreted as equality
401 constraints that restrict the returned rows (combined with AND);
402 keyword arguments are column names and values are the values they
403 must have.
405 Yields
406 ------
407 row : `dict`
408 A dictionary representing a single result row.
409 """
410 yield from self._managers.opaque[tableName].fetch(**where)
412 @transactional
413 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
414 """Remove records from an opaque table.
416 Parameters
417 ----------
418 tableName : `str`
419 Logical name of the opaque table. Must match the name used in a
420 previous call to `registerOpaqueTable`.
421 where
422 Additional keyword arguments are interpreted as equality
423 constraints that restrict the deleted rows (combined with AND);
424 keyword arguments are column names and values are the values they
425 must have.
426 """
427 self._managers.opaque[tableName].delete(where.keys(), where)
429 def registerCollection(
430 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: str | None = None
431 ) -> bool:
432 """Add a new collection if one with the given name does not exist.
434 Parameters
435 ----------
436 name : `str`
437 The name of the collection to create.
438 type : `CollectionType`
439 Enum value indicating the type of collection to create.
440 doc : `str`, optional
441 Documentation string for the collection.
443 Returns
444 -------
445 registered : `bool`
446 Boolean indicating whether the collection was already registered
447 or was created by this call.
449 Notes
450 -----
451 This method cannot be called within transactions, as it needs to be
452 able to perform its own transaction to be concurrent.
453 """
454 _, registered = self._managers.collections.register(name, type, doc=doc)
455 return registered
457 def getCollectionType(self, name: str) -> CollectionType:
458 """Return an enumeration value indicating the type of the given
459 collection.
461 Parameters
462 ----------
463 name : `str`
464 The name of the collection.
466 Returns
467 -------
468 type : `CollectionType`
469 Enum value indicating the type of this collection.
471 Raises
472 ------
473 lsst.daf.butler.registry.MissingCollectionError
474 Raised if no collection with the given name exists.
475 """
476 return self._managers.collections.find(name).type
478 def _get_collection_record(self, name: str) -> CollectionRecord:
479 """Return the record for this collection.
481 Parameters
482 ----------
483 name : `str`
484 Name of the collection for which the record is to be retrieved.
486 Returns
487 -------
488 record : `CollectionRecord`
489 The record for this collection.
490 """
491 return self._managers.collections.find(name)
493 def registerRun(self, name: str, doc: str | None = None) -> bool:
494 """Add a new run if one with the given name does not exist.
496 Parameters
497 ----------
498 name : `str`
499 The name of the run to create.
500 doc : `str`, optional
501 Documentation string for the collection.
503 Returns
504 -------
505 registered : `bool`
506 Boolean indicating whether a new run was registered. `False`
507 if it already existed.
509 Notes
510 -----
511 This method cannot be called within transactions, as it needs to be
512 able to perform its own transaction to be concurrent.
513 """
514 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
515 return registered
517 @transactional
518 def removeCollection(self, name: str) -> None:
519 """Remove the given collection from the registry.
521 Parameters
522 ----------
523 name : `str`
524 The name of the collection to remove.
526 Raises
527 ------
528 lsst.daf.butler.registry.MissingCollectionError
529 Raised if no collection with the given name exists.
530 sqlalchemy.exc.IntegrityError
531 Raised if the database rows associated with the collection are
532 still referenced by some other table, such as a dataset in a
533 datastore (for `~CollectionType.RUN` collections only) or a
534 `~CollectionType.CHAINED` collection of which this collection is
535 a child.
537 Notes
538 -----
539 If this is a `~CollectionType.RUN` collection, all datasets and quanta
540 in it will removed from the `Registry` database. This requires that
541 those datasets be removed (or at least trashed) from any datastores
542 that hold them first.
544 A collection may not be deleted as long as it is referenced by a
545 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
546 be deleted or redefined first.
547 """
548 self._managers.collections.remove(name)
550 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
551 """Return the child collections in a `~CollectionType.CHAINED`
552 collection.
554 Parameters
555 ----------
556 parent : `str`
557 Name of the chained collection. Must have already been added via
558 a call to `Registry.registerCollection`.
560 Returns
561 -------
562 children : `~collections.abc.Sequence` [ `str` ]
563 An ordered sequence of collection names that are searched when the
564 given chained collection is searched.
566 Raises
567 ------
568 lsst.daf.butler.registry.MissingCollectionError
569 Raised if ``parent`` does not exist in the `Registry`.
570 lsst.daf.butler.registry.CollectionTypeError
571 Raised if ``parent`` does not correspond to a
572 `~CollectionType.CHAINED` collection.
573 """
574 record = self._managers.collections.find(parent)
575 if record.type is not CollectionType.CHAINED:
576 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
577 assert isinstance(record, ChainedCollectionRecord)
578 return record.children
580 @transactional
581 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
582 """Define or redefine a `~CollectionType.CHAINED` collection.
584 Parameters
585 ----------
586 parent : `str`
587 Name of the chained collection. Must have already been added via
588 a call to `Registry.registerCollection`.
589 children : collection expression
590 An expression defining an ordered search of child collections,
591 generally an iterable of `str`; see
592 :ref:`daf_butler_collection_expressions` for more information.
593 flatten : `bool`, optional
594 If `True` (`False` is default), recursively flatten out any nested
595 `~CollectionType.CHAINED` collections in ``children`` first.
597 Raises
598 ------
599 lsst.daf.butler.registry.MissingCollectionError
600 Raised when any of the given collections do not exist in the
601 `Registry`.
602 lsst.daf.butler.registry.CollectionTypeError
603 Raised if ``parent`` does not correspond to a
604 `~CollectionType.CHAINED` collection.
605 ValueError
606 Raised if the given collections contains a cycle.
607 """
608 record = self._managers.collections.find(parent)
609 if record.type is not CollectionType.CHAINED:
610 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
611 assert isinstance(record, ChainedCollectionRecord)
612 children = CollectionWildcard.from_expression(children).require_ordered()
613 if children != record.children or flatten:
614 self._managers.collections.update_chain(record, children, flatten=flatten)
616 def getCollectionParentChains(self, collection: str) -> set[str]:
617 """Return the CHAINED collections that directly contain the given one.
619 Parameters
620 ----------
621 name : `str`
622 Name of the collection.
624 Returns
625 -------
626 chains : `set` of `str`
627 Set of `~CollectionType.CHAINED` collection names.
628 """
629 return self._managers.collections.getParentChains(self._managers.collections.find(collection).key)
631 def getCollectionDocumentation(self, collection: str) -> str | None:
632 """Retrieve the documentation string for a collection.
634 Parameters
635 ----------
636 name : `str`
637 Name of the collection.
639 Returns
640 -------
641 docs : `str` or `None`
642 Docstring for the collection with the given name.
643 """
644 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
646 def setCollectionDocumentation(self, collection: str, doc: str | None) -> None:
647 """Set the documentation string for a collection.
649 Parameters
650 ----------
651 name : `str`
652 Name of the collection.
653 docs : `str` or `None`
654 Docstring for the collection with the given name; will replace any
655 existing docstring. Passing `None` will remove any existing
656 docstring.
657 """
658 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
660 def getCollectionSummary(self, collection: str) -> CollectionSummary:
661 """Return a summary for the given collection.
663 Parameters
664 ----------
665 collection : `str`
666 Name of the collection for which a summary is to be retrieved.
668 Returns
669 -------
670 summary : `~lsst.daf.butler.registry.CollectionSummary`
671 Summary of the dataset types and governor dimension values in
672 this collection.
673 """
674 record = self._managers.collections.find(collection)
675 return self._managers.datasets.getCollectionSummary(record)
677 def registerDatasetType(self, datasetType: DatasetType) -> bool:
678 """Add a new `DatasetType` to the Registry.
680 It is not an error to register the same `DatasetType` twice.
682 Parameters
683 ----------
684 datasetType : `DatasetType`
685 The `DatasetType` to be added.
687 Returns
688 -------
689 inserted : `bool`
690 `True` if ``datasetType`` was inserted, `False` if an identical
691 existing `DatasetType` was found. Note that in either case the
692 DatasetType is guaranteed to be defined in the Registry
693 consistently with the given definition.
695 Raises
696 ------
697 ValueError
698 Raised if the dimensions or storage class are invalid.
699 lsst.daf.butler.registry.ConflictingDefinitionError
700 Raised if this `DatasetType` is already registered with a different
701 definition.
703 Notes
704 -----
705 This method cannot be called within transactions, as it needs to be
706 able to perform its own transaction to be concurrent.
707 """
708 return self._managers.datasets.register(datasetType)
710 def removeDatasetType(self, name: str | tuple[str, ...]) -> None:
711 """Remove the named `DatasetType` from the registry.
713 .. warning::
715 Registry implementations can cache the dataset type definitions.
716 This means that deleting the dataset type definition may result in
717 unexpected behavior from other butler processes that are active
718 that have not seen the deletion.
720 Parameters
721 ----------
722 name : `str` or `tuple` [`str`]
723 Name of the type to be removed or tuple containing a list of type
724 names to be removed. Wildcards are allowed.
726 Raises
727 ------
728 lsst.daf.butler.registry.OrphanedRecordError
729 Raised if an attempt is made to remove the dataset type definition
730 when there are already datasets associated with it.
732 Notes
733 -----
734 If the dataset type is not registered the method will return without
735 action.
736 """
737 for datasetTypeExpression in ensure_iterable(name):
738 # Catch any warnings from the caller specifying a component
739 # dataset type. This will result in an error later but the
740 # warning could be confusing when the caller is not querying
741 # anything.
742 with warnings.catch_warnings():
743 warnings.simplefilter("ignore", category=FutureWarning)
744 datasetTypes = list(self.queryDatasetTypes(datasetTypeExpression))
745 if not datasetTypes:
746 _LOG.info("Dataset type %r not defined", datasetTypeExpression)
747 else:
748 for datasetType in datasetTypes:
749 self._managers.datasets.remove(datasetType.name)
750 _LOG.info("Removed dataset type %r", datasetType.name)
752 def getDatasetType(self, name: str) -> DatasetType:
753 """Get the `DatasetType`.
755 Parameters
756 ----------
757 name : `str`
758 Name of the type.
760 Returns
761 -------
762 type : `DatasetType`
763 The `DatasetType` associated with the given name.
765 Raises
766 ------
767 lsst.daf.butler.registry.MissingDatasetTypeError
768 Raised if the requested dataset type has not been registered.
770 Notes
771 -----
772 This method handles component dataset types automatically, though most
773 other registry operations do not.
774 """
775 parent_name, component = DatasetType.splitDatasetTypeName(name)
776 storage = self._managers.datasets[parent_name]
777 if component is None:
778 return storage.datasetType
779 else:
780 return storage.datasetType.makeComponentDatasetType(component)
782 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
783 """Test whether the given dataset ID generation mode is supported by
784 `insertDatasets`.
786 Parameters
787 ----------
788 mode : `DatasetIdGenEnum`
789 Enum value for the mode to test.
791 Returns
792 -------
793 supported : `bool`
794 Whether the given mode is supported.
795 """
796 return self._managers.datasets.supportsIdGenerationMode(mode)
798 def findDataset(
799 self,
800 datasetType: DatasetType | str,
801 dataId: DataId | None = None,
802 *,
803 collections: CollectionArgType | None = None,
804 timespan: Timespan | None = None,
805 datastore_records: bool = False,
806 **kwargs: Any,
807 ) -> DatasetRef | None:
808 """Find a dataset given its `DatasetType` and data ID.
810 This can be used to obtain a `DatasetRef` that permits the dataset to
811 be read from a `Datastore`. If the dataset is a component and can not
812 be found using the provided dataset type, a dataset ref for the parent
813 will be returned instead but with the correct dataset type.
815 Parameters
816 ----------
817 datasetType : `DatasetType` or `str`
818 A `DatasetType` or the name of one. If this is a `DatasetType`
819 instance, its storage class will be respected and propagated to
820 the output, even if it differs from the dataset type definition
821 in the registry, as long as the storage classes are convertible.
822 dataId : `dict` or `DataCoordinate`, optional
823 A `dict`-like object containing the `Dimension` links that identify
824 the dataset within a collection.
825 collections : collection expression, optional
826 An expression that fully or partially identifies the collections to
827 search for the dataset; see
828 :ref:`daf_butler_collection_expressions` for more information.
829 Defaults to ``self.defaults.collections``.
830 timespan : `Timespan`, optional
831 A timespan that the validity range of the dataset must overlap.
832 If not provided, any `~CollectionType.CALIBRATION` collections
833 matched by the ``collections`` argument will not be searched.
834 **kwargs
835 Additional keyword arguments passed to
836 `DataCoordinate.standardize` to convert ``dataId`` to a true
837 `DataCoordinate` or augment an existing one.
839 Returns
840 -------
841 ref : `DatasetRef`
842 A reference to the dataset, or `None` if no matching Dataset
843 was found.
845 Raises
846 ------
847 lsst.daf.butler.registry.NoDefaultCollectionError
848 Raised if ``collections`` is `None` and
849 ``self.defaults.collections`` is `None`.
850 LookupError
851 Raised if one or more data ID keys are missing.
852 lsst.daf.butler.registry.MissingDatasetTypeError
853 Raised if the dataset type does not exist.
854 lsst.daf.butler.registry.MissingCollectionError
855 Raised if any of ``collections`` does not exist in the registry.
857 Notes
858 -----
859 This method simply returns `None` and does not raise an exception even
860 when the set of collections searched is intrinsically incompatible with
861 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
862 only `~CollectionType.CALIBRATION` collections are being searched.
863 This may make it harder to debug some lookup failures, but the behavior
864 is intentional; we consider it more important that failed searches are
865 reported consistently, regardless of the reason, and that adding
866 additional collections that do not contain a match to the search path
867 never changes the behavior.
869 This method handles component dataset types automatically, though most
870 other registry operations do not.
871 """
872 if collections is None:
873 if not self.defaults.collections:
874 raise NoDefaultCollectionError(
875 "No collections provided to findDataset, and no defaults from registry construction."
876 )
877 collections = self.defaults.collections
878 backend = queries.SqlQueryBackend(self._db, self._managers)
879 collection_wildcard = CollectionWildcard.from_expression(collections, require_ordered=True)
880 if collection_wildcard.empty():
881 return None
882 matched_collections = backend.resolve_collection_wildcard(collection_wildcard)
883 parent_dataset_type, components = backend.resolve_single_dataset_type_wildcard(
884 datasetType, components_deprecated=False
885 )
886 if len(components) > 1:
887 raise DatasetTypeError(
888 f"findDataset requires exactly one dataset type; got multiple components {components} "
889 f"for parent dataset type {parent_dataset_type.name}."
890 )
891 component = components[0]
892 dataId = DataCoordinate.standardize(
893 dataId,
894 dimensions=parent_dataset_type.dimensions,
895 universe=self.dimensions,
896 defaults=self.defaults.dataId,
897 **kwargs,
898 )
899 governor_constraints = {name: {cast(str, dataId[name])} for name in dataId.dimensions.governors}
900 (filtered_collections,) = backend.filter_dataset_collections(
901 [parent_dataset_type],
902 matched_collections,
903 governor_constraints=governor_constraints,
904 ).values()
905 if not filtered_collections:
906 return None
907 if timespan is None:
908 filtered_collections = [
909 collection_record
910 for collection_record in filtered_collections
911 if collection_record.type is not CollectionType.CALIBRATION
912 ]
913 if filtered_collections:
914 requested_columns = {"dataset_id", "run", "collection"}
915 with backend.context() as context:
916 predicate = context.make_data_coordinate_predicate(
917 dataId.subset(parent_dataset_type.dimensions), full=False
918 )
919 if timespan is not None:
920 requested_columns.add("timespan")
921 predicate = predicate.logical_and(
922 context.make_timespan_overlap_predicate(
923 DatasetColumnTag(parent_dataset_type.name, "timespan"), timespan
924 )
925 )
926 relation = backend.make_dataset_query_relation(
927 parent_dataset_type, filtered_collections, requested_columns, context
928 ).with_rows_satisfying(predicate)
929 rows = list(context.fetch_iterable(relation))
930 else:
931 rows = []
932 if not rows:
933 return None
934 elif len(rows) == 1:
935 best_row = rows[0]
936 else:
937 rank_by_collection_key = {record.key: n for n, record in enumerate(filtered_collections)}
938 collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
939 row_iter = iter(rows)
940 best_row = next(row_iter)
941 best_rank = rank_by_collection_key[best_row[collection_tag]]
942 have_tie = False
943 for row in row_iter:
944 if (rank := rank_by_collection_key[row[collection_tag]]) < best_rank:
945 best_row = row
946 best_rank = rank
947 have_tie = False
948 elif rank == best_rank:
949 have_tie = True
950 assert timespan is not None, "Rank ties should be impossible given DB constraints."
951 if have_tie:
952 raise LookupError(
953 f"Ambiguous calibration lookup for {parent_dataset_type.name} in collections "
954 f"{collection_wildcard.strings} with timespan {timespan}."
955 )
956 reader = queries.DatasetRefReader(
957 parent_dataset_type,
958 translate_collection=lambda k: self._managers.collections[k].name,
959 )
960 ref = reader.read(best_row, data_id=dataId)
961 if component is not None:
962 ref = ref.makeComponentRef(component)
963 if datastore_records:
964 ref = self.get_datastore_records(ref)
966 return ref
968 @transactional
969 def insertDatasets(
970 self,
971 datasetType: DatasetType | str,
972 dataIds: Iterable[DataId],
973 run: str | None = None,
974 expand: bool = True,
975 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
976 ) -> list[DatasetRef]:
977 """Insert one or more datasets into the `Registry`.
979 This always adds new datasets; to associate existing datasets with
980 a new collection, use ``associate``.
982 Parameters
983 ----------
984 datasetType : `DatasetType` or `str`
985 A `DatasetType` or the name of one.
986 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
987 Dimension-based identifiers for the new datasets.
988 run : `str`, optional
989 The name of the run that produced the datasets. Defaults to
990 ``self.defaults.run``.
991 expand : `bool`, optional
992 If `True` (default), expand data IDs as they are inserted. This is
993 necessary in general to allow datastore to generate file templates,
994 but it may be disabled if the caller can guarantee this is
995 unnecessary.
996 idGenerationMode : `DatasetIdGenEnum`, optional
997 Specifies option for generating dataset IDs. By default unique IDs
998 are generated for each inserted dataset.
1000 Returns
1001 -------
1002 refs : `list` of `DatasetRef`
1003 Resolved `DatasetRef` instances for all given data IDs (in the same
1004 order).
1006 Raises
1007 ------
1008 lsst.daf.butler.registry.DatasetTypeError
1009 Raised if ``datasetType`` is not known to registry.
1010 lsst.daf.butler.registry.CollectionTypeError
1011 Raised if ``run`` collection type is not `~CollectionType.RUN`.
1012 lsst.daf.butler.registry.NoDefaultCollectionError
1013 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1014 lsst.daf.butler.registry.ConflictingDefinitionError
1015 If a dataset with the same dataset type and data ID as one of those
1016 given already exists in ``run``.
1017 lsst.daf.butler.registry.MissingCollectionError
1018 Raised if ``run`` does not exist in the registry.
1019 """
1020 if isinstance(datasetType, DatasetType):
1021 storage = self._managers.datasets.find(datasetType.name)
1022 if storage is None:
1023 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1024 else:
1025 storage = self._managers.datasets.find(datasetType)
1026 if storage is None:
1027 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
1028 if run is None:
1029 if self.defaults.run is None:
1030 raise NoDefaultCollectionError(
1031 "No run provided to insertDatasets, and no default from registry construction."
1032 )
1033 run = self.defaults.run
1034 runRecord = self._managers.collections.find(run)
1035 if runRecord.type is not CollectionType.RUN:
1036 raise CollectionTypeError(
1037 f"Given collection is of type {runRecord.type.name}; RUN collection required."
1038 )
1039 assert isinstance(runRecord, RunRecord)
1040 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1041 if expand:
1042 expandedDataIds = [
1043 self.expandDataId(dataId, dimensions=storage.datasetType.dimensions)
1044 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
1045 ]
1046 else:
1047 expandedDataIds = [
1048 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
1049 ]
1050 try:
1051 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
1052 if self._managers.obscore:
1053 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1054 self._managers.obscore.add_datasets(refs, context)
1055 except sqlalchemy.exc.IntegrityError as err:
1056 raise ConflictingDefinitionError(
1057 "A database constraint failure was triggered by inserting "
1058 f"one or more datasets of type {storage.datasetType} into "
1059 f"collection '{run}'. "
1060 "This probably means a dataset with the same data ID "
1061 "and dataset type already exists, but it may also mean a "
1062 "dimension row is missing."
1063 ) from err
1064 return refs
1066 @transactional
1067 def _importDatasets(
1068 self,
1069 datasets: Iterable[DatasetRef],
1070 expand: bool = True,
1071 ) -> list[DatasetRef]:
1072 """Import one or more datasets into the `Registry`.
1074 Difference from `insertDatasets` method is that this method accepts
1075 `DatasetRef` instances which should already be resolved and have a
1076 dataset ID. If registry supports globally-unique dataset IDs (e.g.
1077 `uuid.UUID`) then datasets which already exist in the registry will be
1078 ignored if imported again.
1080 Parameters
1081 ----------
1082 datasets : `~collections.abc.Iterable` of `DatasetRef`
1083 Datasets to be inserted. All `DatasetRef` instances must have
1084 identical ``datasetType`` and ``run`` attributes. ``run``
1085 attribute can be `None` and defaults to ``self.defaults.run``.
1086 Datasets can specify ``id`` attribute which will be used for
1087 inserted datasets. All dataset IDs must have the same type
1088 (`int` or `uuid.UUID`), if type of dataset IDs does not match
1089 configured backend then IDs will be ignored and new IDs will be
1090 generated by backend.
1091 expand : `bool`, optional
1092 If `True` (default), expand data IDs as they are inserted. This is
1093 necessary in general, but it may be disabled if the caller can
1094 guarantee this is unnecessary.
1096 Returns
1097 -------
1098 refs : `list` of `DatasetRef`
1099 Resolved `DatasetRef` instances for all given data IDs (in the same
1100 order). If any of ``datasets`` has an ID which already exists in
1101 the database then it will not be inserted or updated, but a
1102 resolved `DatasetRef` will be returned for it in any case.
1104 Raises
1105 ------
1106 lsst.daf.butler.registry.NoDefaultCollectionError
1107 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1108 lsst.daf.butler.registry.DatasetTypeError
1109 Raised if datasets correspond to more than one dataset type or
1110 dataset type is not known to registry.
1111 lsst.daf.butler.registry.ConflictingDefinitionError
1112 If a dataset with the same dataset type and data ID as one of those
1113 given already exists in ``run``.
1114 lsst.daf.butler.registry.MissingCollectionError
1115 Raised if ``run`` does not exist in the registry.
1117 Notes
1118 -----
1119 This method is considered package-private and internal to Butler
1120 implementation. Clients outside daf_butler package should not use this
1121 method.
1122 """
1123 datasets = list(datasets)
1124 if not datasets:
1125 # nothing to do
1126 return []
1128 # find dataset type
1129 datasetTypes = {dataset.datasetType for dataset in datasets}
1130 if len(datasetTypes) != 1:
1131 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
1132 datasetType = datasetTypes.pop()
1134 # get storage handler for this dataset type
1135 storage = self._managers.datasets.find(datasetType.name)
1136 if storage is None:
1137 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1139 # find run name
1140 runs = {dataset.run for dataset in datasets}
1141 if len(runs) != 1:
1142 raise ValueError(f"Multiple run names in input datasets: {runs}")
1143 run = runs.pop()
1145 runRecord = self._managers.collections.find(run)
1146 if runRecord.type is not CollectionType.RUN:
1147 raise CollectionTypeError(
1148 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
1149 " RUN collection required."
1150 )
1151 assert isinstance(runRecord, RunRecord)
1153 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1154 if expand:
1155 expandedDatasets = [
1156 dataset.expanded(self.expandDataId(dataset.dataId, dimensions=storage.datasetType.dimensions))
1157 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
1158 ]
1159 else:
1160 expandedDatasets = [
1161 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
1162 for dataset in datasets
1163 ]
1165 try:
1166 refs = list(storage.import_(runRecord, expandedDatasets))
1167 if self._managers.obscore:
1168 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1169 self._managers.obscore.add_datasets(refs, context)
1170 except sqlalchemy.exc.IntegrityError as err:
1171 raise ConflictingDefinitionError(
1172 "A database constraint failure was triggered by inserting "
1173 f"one or more datasets of type {storage.datasetType} into "
1174 f"collection '{run}'. "
1175 "This probably means a dataset with the same data ID "
1176 "and dataset type already exists, but it may also mean a "
1177 "dimension row is missing."
1178 ) from err
1179 # Check that imported dataset IDs match the input
1180 for imported_ref, input_ref in zip(refs, datasets, strict=True):
1181 if imported_ref.id != input_ref.id:
1182 raise RegistryConsistencyError(
1183 "Imported dataset ID differs from input dataset ID, "
1184 f"input ref: {input_ref}, imported ref: {imported_ref}"
1185 )
1186 return refs
1188 def getDataset(self, id: DatasetId) -> DatasetRef | None:
1189 """Retrieve a Dataset entry.
1191 Parameters
1192 ----------
1193 id : `DatasetId`
1194 The unique identifier for the dataset.
1196 Returns
1197 -------
1198 ref : `DatasetRef` or `None`
1199 A ref to the Dataset, or `None` if no matching Dataset
1200 was found.
1201 """
1202 return self._managers.datasets.getDatasetRef(id)
1204 @transactional
1205 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
1206 """Remove datasets from the Registry.
1208 The datasets will be removed unconditionally from all collections, and
1209 any `Quantum` that consumed this dataset will instead be marked with
1210 having a NULL input. `Datastore` records will *not* be deleted; the
1211 caller is responsible for ensuring that the dataset has already been
1212 removed from all Datastores.
1214 Parameters
1215 ----------
1216 refs : `~collections.abc.Iterable` [`DatasetRef`]
1217 References to the datasets to be removed. Must include a valid
1218 ``id`` attribute, and should be considered invalidated upon return.
1220 Raises
1221 ------
1222 lsst.daf.butler.AmbiguousDatasetError
1223 Raised if any ``ref.id`` is `None`.
1224 lsst.daf.butler.registry.OrphanedRecordError
1225 Raised if any dataset is still present in any `Datastore`.
1226 """
1227 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
1228 for datasetType, refsForType in progress.iter_item_chunks(
1229 DatasetRef.iter_by_type(refs), desc="Removing datasets by type"
1230 ):
1231 storage = self._managers.datasets[datasetType.name]
1232 try:
1233 storage.delete(refsForType)
1234 except sqlalchemy.exc.IntegrityError as err:
1235 raise OrphanedRecordError(
1236 "One or more datasets is still present in one or more Datastores."
1237 ) from err
1239 @transactional
1240 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1241 """Add existing datasets to a `~CollectionType.TAGGED` collection.
1243 If a DatasetRef with the same exact ID is already in a collection
1244 nothing is changed. If a `DatasetRef` with the same `DatasetType` and
1245 data ID but with different ID exists in the collection,
1246 `~lsst.daf.butler.registry.ConflictingDefinitionError` is raised.
1248 Parameters
1249 ----------
1250 collection : `str`
1251 Indicates the collection the datasets should be associated with.
1252 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1253 An iterable of resolved `DatasetRef` instances that already exist
1254 in this `Registry`.
1256 Raises
1257 ------
1258 lsst.daf.butler.registry.ConflictingDefinitionError
1259 If a Dataset with the given `DatasetRef` already exists in the
1260 given collection.
1261 lsst.daf.butler.registry.MissingCollectionError
1262 Raised if ``collection`` does not exist in the registry.
1263 lsst.daf.butler.registry.CollectionTypeError
1264 Raise adding new datasets to the given ``collection`` is not
1265 allowed.
1266 """
1267 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
1268 collectionRecord = self._managers.collections.find(collection)
1269 if collectionRecord.type is not CollectionType.TAGGED:
1270 raise CollectionTypeError(
1271 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
1272 )
1273 for datasetType, refsForType in progress.iter_item_chunks(
1274 DatasetRef.iter_by_type(refs), desc="Associating datasets by type"
1275 ):
1276 storage = self._managers.datasets[datasetType.name]
1277 try:
1278 storage.associate(collectionRecord, refsForType)
1279 if self._managers.obscore:
1280 # If a TAGGED collection is being monitored by ObsCore
1281 # manager then we may need to save the dataset.
1282 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1283 self._managers.obscore.associate(refsForType, collectionRecord, context)
1284 except sqlalchemy.exc.IntegrityError as err:
1285 raise ConflictingDefinitionError(
1286 f"Constraint violation while associating dataset of type {datasetType.name} with "
1287 f"collection {collection}. This probably means that one or more datasets with the same "
1288 "dataset type and data ID already exist in the collection, but it may also indicate "
1289 "that the datasets do not exist."
1290 ) from err
1292 @transactional
1293 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1294 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
1296 ``collection`` and ``ref`` combinations that are not currently
1297 associated are silently ignored.
1299 Parameters
1300 ----------
1301 collection : `str`
1302 The collection the datasets should no longer be associated with.
1303 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1304 An iterable of resolved `DatasetRef` instances that already exist
1305 in this `Registry`.
1307 Raises
1308 ------
1309 lsst.daf.butler.AmbiguousDatasetError
1310 Raised if any of the given dataset references is unresolved.
1311 lsst.daf.butler.registry.MissingCollectionError
1312 Raised if ``collection`` does not exist in the registry.
1313 lsst.daf.butler.registry.CollectionTypeError
1314 Raise adding new datasets to the given ``collection`` is not
1315 allowed.
1316 """
1317 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
1318 collectionRecord = self._managers.collections.find(collection)
1319 if collectionRecord.type is not CollectionType.TAGGED:
1320 raise CollectionTypeError(
1321 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
1322 )
1323 for datasetType, refsForType in progress.iter_item_chunks(
1324 DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type"
1325 ):
1326 storage = self._managers.datasets[datasetType.name]
1327 storage.disassociate(collectionRecord, refsForType)
1328 if self._managers.obscore:
1329 self._managers.obscore.disassociate(refsForType, collectionRecord)
1331 @transactional
1332 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
1333 """Associate one or more datasets with a calibration collection and a
1334 validity range within it.
1336 Parameters
1337 ----------
1338 collection : `str`
1339 The name of an already-registered `~CollectionType.CALIBRATION`
1340 collection.
1341 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1342 Datasets to be associated.
1343 timespan : `Timespan`
1344 The validity range for these datasets within the collection.
1346 Raises
1347 ------
1348 lsst.daf.butler.AmbiguousDatasetError
1349 Raised if any of the given `DatasetRef` instances is unresolved.
1350 lsst.daf.butler.registry.ConflictingDefinitionError
1351 Raised if the collection already contains a different dataset with
1352 the same `DatasetType` and data ID and an overlapping validity
1353 range.
1354 lsst.daf.butler.registry.CollectionTypeError
1355 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1356 collection or if one or more datasets are of a dataset type for
1357 which `DatasetType.isCalibration` returns `False`.
1358 """
1359 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
1360 collectionRecord = self._managers.collections.find(collection)
1361 for datasetType, refsForType in progress.iter_item_chunks(
1362 DatasetRef.iter_by_type(refs), desc="Certifying datasets by type"
1363 ):
1364 storage = self._managers.datasets[datasetType.name]
1365 storage.certify(
1366 collectionRecord,
1367 refsForType,
1368 timespan,
1369 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1370 )
1372 @transactional
1373 def decertify(
1374 self,
1375 collection: str,
1376 datasetType: str | DatasetType,
1377 timespan: Timespan,
1378 *,
1379 dataIds: Iterable[DataId] | None = None,
1380 ) -> None:
1381 """Remove or adjust datasets to clear a validity range within a
1382 calibration collection.
1384 Parameters
1385 ----------
1386 collection : `str`
1387 The name of an already-registered `~CollectionType.CALIBRATION`
1388 collection.
1389 datasetType : `str` or `DatasetType`
1390 Name or `DatasetType` instance for the datasets to be decertified.
1391 timespan : `Timespan`, optional
1392 The validity range to remove datasets from within the collection.
1393 Datasets that overlap this range but are not contained by it will
1394 have their validity ranges adjusted to not overlap it, which may
1395 split a single dataset validity range into two.
1396 dataIds : iterable [`dict` or `DataCoordinate`], optional
1397 Data IDs that should be decertified within the given validity range
1398 If `None`, all data IDs for ``self.datasetType`` will be
1399 decertified.
1401 Raises
1402 ------
1403 lsst.daf.butler.registry.CollectionTypeError
1404 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1405 collection or if ``datasetType.isCalibration() is False``.
1406 """
1407 collectionRecord = self._managers.collections.find(collection)
1408 if isinstance(datasetType, str):
1409 storage = self._managers.datasets[datasetType]
1410 else:
1411 storage = self._managers.datasets[datasetType.name]
1412 standardizedDataIds = None
1413 if dataIds is not None:
1414 standardizedDataIds = [
1415 DataCoordinate.standardize(d, dimensions=storage.datasetType.dimensions) for d in dataIds
1416 ]
1417 storage.decertify(
1418 collectionRecord,
1419 timespan,
1420 dataIds=standardizedDataIds,
1421 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1422 )
1424 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1425 """Return an object that allows a new `Datastore` instance to
1426 communicate with this `Registry`.
1428 Returns
1429 -------
1430 manager : `~.interfaces.DatastoreRegistryBridgeManager`
1431 Object that mediates communication between this `Registry` and its
1432 associated datastores.
1433 """
1434 return self._managers.datastores
1436 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1437 """Retrieve datastore locations for a given dataset.
1439 Parameters
1440 ----------
1441 ref : `DatasetRef`
1442 A reference to the dataset for which to retrieve storage
1443 information.
1445 Returns
1446 -------
1447 datastores : `~collections.abc.Iterable` [ `str` ]
1448 All the matching datastores holding this dataset.
1450 Raises
1451 ------
1452 lsst.daf.butler.AmbiguousDatasetError
1453 Raised if ``ref.id`` is `None`.
1454 """
1455 return self._managers.datastores.findDatastores(ref)
1457 def expandDataId(
1458 self,
1459 dataId: DataId | None = None,
1460 *,
1461 dimensions: Iterable[str] | DimensionGroup | DimensionGraph | None = None,
1462 graph: DimensionGraph | None = None,
1463 records: NameLookupMapping[DimensionElement, DimensionRecord | None] | None = None,
1464 withDefaults: bool = True,
1465 **kwargs: Any,
1466 ) -> DataCoordinate:
1467 """Expand a dimension-based data ID to include additional information.
1469 Parameters
1470 ----------
1471 dataId : `DataCoordinate` or `dict`, optional
1472 Data ID to be expanded; augmented and overridden by ``kwargs``.
1473 dimensions : `~collections.abc.Iterable` [ `str` ], \
1474 `DimensionGroup`, or `DimensionGraph`, optional
1475 The dimensions to be identified by the new `DataCoordinate`.
1476 If not provided, will be inferred from the keys of ``mapping`` and
1477 ``**kwargs``, and ``universe`` must be provided unless ``mapping``
1478 is already a `DataCoordinate`.
1479 graph : `DimensionGraph`, optional
1480 Like ``dimensions``, but as a ``DimensionGraph`` instance. Ignored
1481 if ``dimensions`` is provided. Deprecated and will be removed
1482 after v27.
1483 records : `~collections.abc.Mapping` [`str`, `DimensionRecord`], \
1484 optional
1485 Dimension record data to use before querying the database for that
1486 data, keyed by element name.
1487 withDefaults : `bool`, optional
1488 Utilize ``self.defaults.dataId`` to fill in missing governor
1489 dimension key-value pairs. Defaults to `True` (i.e. defaults are
1490 used).
1491 **kwargs
1492 Additional keywords are treated like additional key-value pairs for
1493 ``dataId``, extending and overriding
1495 Returns
1496 -------
1497 expanded : `DataCoordinate`
1498 A data ID that includes full metadata for all of the dimensions it
1499 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1500 ``expanded.hasFull()`` both return `True`.
1502 Raises
1503 ------
1504 lsst.daf.butler.registry.DataIdError
1505 Raised when ``dataId`` or keyword arguments specify unknown
1506 dimensions or values, or when a resulting data ID contains
1507 contradictory key-value pairs, according to dimension
1508 relationships.
1510 Notes
1511 -----
1512 This method cannot be relied upon to reject invalid data ID values
1513 for dimensions that do actually not have any record columns. For
1514 efficiency reasons the records for these dimensions (which have only
1515 dimension key values that are given by the caller) may be constructed
1516 directly rather than obtained from the registry database.
1517 """
1518 if not withDefaults:
1519 defaults = None
1520 else:
1521 defaults = self.defaults.dataId
1522 try:
1523 standardized = DataCoordinate.standardize(
1524 dataId,
1525 graph=graph,
1526 dimensions=dimensions,
1527 universe=self.dimensions,
1528 defaults=defaults,
1529 **kwargs,
1530 )
1531 except KeyError as exc:
1532 # This means either kwargs have some odd name or required
1533 # dimension is missing.
1534 raise DimensionNameError(str(exc)) from exc
1535 if standardized.hasRecords():
1536 return standardized
1537 if records is None:
1538 records = {}
1539 elif isinstance(records, NamedKeyMapping):
1540 records = records.byName()
1541 else:
1542 records = dict(records)
1543 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1544 for element_name in dataId.dimensions.elements:
1545 records[element_name] = dataId.records[element_name]
1546 keys = dict(standardized.mapping)
1547 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1548 for element_name in standardized.dimensions.lookup_order:
1549 element = self.dimensions[element_name]
1550 record = records.get(element_name, ...) # Use ... to mean not found; None might mean NULL
1551 if record is ...:
1552 if element_name in self.dimensions.dimensions.names and keys.get(element_name) is None:
1553 if element_name in standardized.dimensions.required:
1554 raise DimensionNameError(
1555 f"No value or null value for required dimension {element_name}."
1556 )
1557 keys[element_name] = None
1558 record = None
1559 else:
1560 storage = self._managers.dimensions[element_name]
1561 record = storage.fetch_one(
1562 DataCoordinate.standardize(keys, dimensions=element.minimal_group), context
1563 )
1564 records[element_name] = record
1565 if record is not None:
1566 for d in element.implied:
1567 value = getattr(record, d.name)
1568 if keys.setdefault(d.name, value) != value:
1569 raise InconsistentDataIdError(
1570 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1571 f"but {element_name} implies {d.name}={value!r}."
1572 )
1573 else:
1574 if element_name in standardized.dimensions.required:
1575 raise DataIdValueError(
1576 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1577 )
1578 if element.alwaysJoin:
1579 raise InconsistentDataIdError(
1580 f"Could not fetch record for element {element_name} via keys {keys}, ",
1581 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1582 "related.",
1583 )
1584 for d in element.implied:
1585 keys.setdefault(d.name, None)
1586 records.setdefault(d.name, None)
1587 return DataCoordinate.standardize(keys, dimensions=standardized.dimensions).expanded(records=records)
1589 def insertDimensionData(
1590 self,
1591 element: DimensionElement | str,
1592 *data: Mapping[str, Any] | DimensionRecord,
1593 conform: bool = True,
1594 replace: bool = False,
1595 skip_existing: bool = False,
1596 ) -> None:
1597 """Insert one or more dimension records into the database.
1599 Parameters
1600 ----------
1601 element : `DimensionElement` or `str`
1602 The `DimensionElement` or name thereof that identifies the table
1603 records will be inserted into.
1604 *data : `dict` or `DimensionRecord`
1605 One or more records to insert.
1606 conform : `bool`, optional
1607 If `False` (`True` is default) perform no checking or conversions,
1608 and assume that ``element`` is a `DimensionElement` instance and
1609 ``data`` is a one or more `DimensionRecord` instances of the
1610 appropriate subclass.
1611 replace : `bool`, optional
1612 If `True` (`False` is default), replace existing records in the
1613 database if there is a conflict.
1614 skip_existing : `bool`, optional
1615 If `True` (`False` is default), skip insertion if a record with
1616 the same primary key values already exists. Unlike
1617 `syncDimensionData`, this will not detect when the given record
1618 differs from what is in the database, and should not be used when
1619 this is a concern.
1620 """
1621 if conform:
1622 if isinstance(element, str):
1623 element = self.dimensions[element]
1624 records = [
1625 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
1626 ]
1627 else:
1628 # Ignore typing since caller said to trust them with conform=False.
1629 records = data # type: ignore
1630 storage = self._managers.dimensions[element]
1631 storage.insert(*records, replace=replace, skip_existing=skip_existing)
1633 def syncDimensionData(
1634 self,
1635 element: DimensionElement | str,
1636 row: Mapping[str, Any] | DimensionRecord,
1637 conform: bool = True,
1638 update: bool = False,
1639 ) -> bool | dict[str, Any]:
1640 """Synchronize the given dimension record with the database, inserting
1641 if it does not already exist and comparing values if it does.
1643 Parameters
1644 ----------
1645 element : `DimensionElement` or `str`
1646 The `DimensionElement` or name thereof that identifies the table
1647 records will be inserted into.
1648 row : `dict` or `DimensionRecord`
1649 The record to insert.
1650 conform : `bool`, optional
1651 If `False` (`True` is default) perform no checking or conversions,
1652 and assume that ``element`` is a `DimensionElement` instance and
1653 ``data`` is a one or more `DimensionRecord` instances of the
1654 appropriate subclass.
1655 update : `bool`, optional
1656 If `True` (`False` is default), update the existing record in the
1657 database if there is a conflict.
1659 Returns
1660 -------
1661 inserted_or_updated : `bool` or `dict`
1662 `True` if a new row was inserted, `False` if no changes were
1663 needed, or a `dict` mapping updated column names to their old
1664 values if an update was performed (only possible if
1665 ``update=True``).
1667 Raises
1668 ------
1669 lsst.daf.butler.registry.ConflictingDefinitionError
1670 Raised if the record exists in the database (according to primary
1671 key lookup) but is inconsistent with the given one.
1672 """
1673 if conform:
1674 if isinstance(element, str):
1675 element = self.dimensions[element]
1676 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1677 else:
1678 # Ignore typing since caller said to trust them with conform=False.
1679 record = row # type: ignore
1680 storage = self._managers.dimensions[element]
1681 return storage.sync(record, update=update)
1683 def queryDatasetTypes(
1684 self,
1685 expression: Any = ...,
1686 *,
1687 components: bool | None = False,
1688 missing: list[str] | None = None,
1689 ) -> Iterable[DatasetType]:
1690 """Iterate over the dataset types whose names match an expression.
1692 Parameters
1693 ----------
1694 expression : dataset type expression, optional
1695 An expression that fully or partially identifies the dataset types
1696 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1697 ``...`` can be used to return all dataset types, and is the
1698 default. See :ref:`daf_butler_dataset_type_expressions` for more
1699 information.
1700 components : `bool`, optional
1701 If `True`, apply all expression patterns to component dataset type
1702 names as well. If `False`, never apply patterns to components.
1703 If `None`, apply patterns to components only if their
1704 parent datasets were not matched by the expression.
1705 Fully-specified component datasets (`str` or `DatasetType`
1706 instances) are always included.
1708 Values other than `False` are deprecated, and only `False` will be
1709 supported after v26. After v27 this argument will be removed
1710 entirely.
1711 missing : `list` of `str`, optional
1712 String dataset type names that were explicitly given (i.e. not
1713 regular expression patterns) but not found will be appended to this
1714 list, if it is provided.
1716 Returns
1717 -------
1718 dataset_types : `~collections.abc.Iterable` [ `DatasetType`]
1719 An `~collections.abc.Iterable` of `DatasetType` instances whose
1720 names match ``expression``.
1722 Raises
1723 ------
1724 lsst.daf.butler.registry.DatasetTypeExpressionError
1725 Raised when ``expression`` is invalid.
1726 """
1727 wildcard = DatasetTypeWildcard.from_expression(expression)
1728 composition_dict = self._managers.datasets.resolve_wildcard(
1729 wildcard,
1730 components=components,
1731 missing=missing,
1732 )
1733 result: list[DatasetType] = []
1734 for parent_dataset_type, components_for_parent in composition_dict.items():
1735 result.extend(
1736 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
1737 for c in components_for_parent
1738 )
1739 return result
1741 def queryCollections(
1742 self,
1743 expression: Any = ...,
1744 datasetType: DatasetType | None = None,
1745 collectionTypes: Iterable[CollectionType] | CollectionType = CollectionType.all(),
1746 flattenChains: bool = False,
1747 includeChains: bool | None = None,
1748 ) -> Sequence[str]:
1749 """Iterate over the collections whose names match an expression.
1751 Parameters
1752 ----------
1753 expression : collection expression, optional
1754 An expression that identifies the collections to return, such as
1755 a `str` (for full matches or partial matches via globs),
1756 `re.Pattern` (for partial matches), or iterable thereof. ``...``
1757 can be used to return all collections, and is the default.
1758 See :ref:`daf_butler_collection_expressions` for more information.
1759 datasetType : `DatasetType`, optional
1760 If provided, only yield collections that may contain datasets of
1761 this type. This is a conservative approximation in general; it may
1762 yield collections that do not have any such datasets.
1763 collectionTypes : `~collections.abc.Set` [`CollectionType`] or \
1764 `CollectionType`, optional
1765 If provided, only yield collections of these types.
1766 flattenChains : `bool`, optional
1767 If `True` (`False` is default), recursively yield the child
1768 collections of matching `~CollectionType.CHAINED` collections.
1769 includeChains : `bool`, optional
1770 If `True`, yield records for matching `~CollectionType.CHAINED`
1771 collections. Default is the opposite of ``flattenChains``: include
1772 either CHAINED collections or their children, but not both.
1774 Returns
1775 -------
1776 collections : `~collections.abc.Sequence` [ `str` ]
1777 The names of collections that match ``expression``.
1779 Raises
1780 ------
1781 lsst.daf.butler.registry.CollectionExpressionError
1782 Raised when ``expression`` is invalid.
1784 Notes
1785 -----
1786 The order in which collections are returned is unspecified, except that
1787 the children of a `~CollectionType.CHAINED` collection are guaranteed
1788 to be in the order in which they are searched. When multiple parent
1789 `~CollectionType.CHAINED` collections match the same criteria, the
1790 order in which the two lists appear is unspecified, and the lists of
1791 children may be incomplete if a child has multiple parents.
1792 """
1793 # Right now the datasetTypes argument is completely ignored, but that
1794 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1795 # ticket will take care of that.
1796 try:
1797 wildcard = CollectionWildcard.from_expression(expression)
1798 except TypeError as exc:
1799 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
1800 collectionTypes = ensure_iterable(collectionTypes)
1801 return [
1802 record.name
1803 for record in self._managers.collections.resolve_wildcard(
1804 wildcard,
1805 collection_types=frozenset(collectionTypes),
1806 flatten_chains=flattenChains,
1807 include_chains=includeChains,
1808 )
1809 ]
1811 def _makeQueryBuilder(
1812 self,
1813 summary: queries.QuerySummary,
1814 doomed_by: Iterable[str] = (),
1815 ) -> queries.QueryBuilder:
1816 """Return a `QueryBuilder` instance capable of constructing and
1817 managing more complex queries than those obtainable via `Registry`
1818 interfaces.
1820 This is an advanced interface; downstream code should prefer
1821 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1822 are sufficient.
1824 Parameters
1825 ----------
1826 summary : `queries.QuerySummary`
1827 Object describing and categorizing the full set of dimensions that
1828 will be included in the query.
1829 doomed_by : `~collections.abc.Iterable` of `str`, optional
1830 A list of diagnostic messages that indicate why the query is going
1831 to yield no results and should not even be executed. If an empty
1832 container (default) the query will be executed unless other code
1833 determines that it is doomed.
1835 Returns
1836 -------
1837 builder : `queries.QueryBuilder`
1838 Object that can be used to construct and perform advanced queries.
1839 """
1840 doomed_by = list(doomed_by)
1841 backend = queries.SqlQueryBackend(self._db, self._managers)
1842 context = backend.context()
1843 relation: Relation | None = None
1844 if doomed_by:
1845 relation = LeafRelation.make_doomed(context.sql_engine, set(), doomed_by)
1846 return queries.QueryBuilder(
1847 summary,
1848 backend=backend,
1849 context=context,
1850 relation=relation,
1851 )
1853 def _standardize_query_data_id_args(
1854 self, data_id: DataId | None, *, doomed_by: list[str], **kwargs: Any
1855 ) -> DataCoordinate:
1856 """Preprocess the data ID arguments passed to query* methods.
1858 Parameters
1859 ----------
1860 data_id : `DataId` or `None`
1861 Data ID that constrains the query results.
1862 doomed_by : `list` [ `str` ]
1863 List to append messages indicating why the query is doomed to
1864 yield no results.
1865 **kwargs
1866 Additional data ID key-value pairs, extending and overriding
1867 ``data_id``.
1869 Returns
1870 -------
1871 data_id : `DataCoordinate`
1872 Standardized data ID. Will be fully expanded unless expansion
1873 fails, in which case a message will be appended to ``doomed_by``
1874 on return.
1875 """
1876 try:
1877 return self.expandDataId(data_id, **kwargs)
1878 except DataIdValueError as err:
1879 doomed_by.append(str(err))
1880 return DataCoordinate.standardize(
1881 data_id, **kwargs, universe=self.dimensions, defaults=self.defaults.dataId
1882 )
1884 def _standardize_query_dataset_args(
1885 self,
1886 datasets: Any,
1887 collections: CollectionArgType | None,
1888 components: bool | None,
1889 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
1890 *,
1891 doomed_by: list[str],
1892 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
1893 """Preprocess dataset arguments passed to query* methods.
1895 Parameters
1896 ----------
1897 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
1898 Expression identifying dataset types. See `queryDatasetTypes` for
1899 details.
1900 collections : `str`, `re.Pattern`, or iterable of these
1901 Expression identifying collections to be searched. See
1902 `queryCollections` for details.
1903 components : `bool`, optional
1904 If `True`, apply all expression patterns to component dataset type
1905 names as well. If `False`, never apply patterns to components.
1906 If `None` (default), apply patterns to components only if their
1907 parent datasets were not matched by the expression.
1908 Fully-specified component datasets (`str` or `DatasetType`
1909 instances) are always included.
1911 Values other than `False` are deprecated, and only `False` will be
1912 supported after v26. After v27 this argument will be removed
1913 entirely.
1914 mode : `str`, optional
1915 The way in which datasets are being used in this query; one of:
1917 - "find_first": this is a query for the first dataset in an
1918 ordered list of collections. Prohibits collection wildcards,
1919 but permits dataset type wildcards.
1921 - "find_all": this is a query for all datasets in all matched
1922 collections. Permits collection and dataset type wildcards.
1924 - "constrain": this is a query for something other than datasets,
1925 with results constrained by dataset existence. Permits
1926 collection wildcards and prohibits ``...`` as a dataset type
1927 wildcard.
1928 doomed_by : `list` [ `str` ]
1929 List to append messages indicating why the query is doomed to
1930 yield no results.
1932 Returns
1933 -------
1934 composition : `defaultdict` [ `DatasetType`, `list` [ `str` ] ]
1935 Dictionary mapping parent dataset type to `list` of components
1936 matched for that dataset type (or `None` for the parent itself).
1937 collections : `CollectionWildcard`
1938 Processed collection expression.
1939 """
1940 composition: dict[DatasetType, list[str | None]] = {}
1941 collection_wildcard: CollectionWildcard | None = None
1942 if datasets is not None:
1943 if collections is None:
1944 if not self.defaults.collections:
1945 raise NoDefaultCollectionError("No collections, and no registry default collections.")
1946 collection_wildcard = CollectionWildcard.from_expression(self.defaults.collections)
1947 else:
1948 collection_wildcard = CollectionWildcard.from_expression(collections)
1949 if mode == "find_first" and collection_wildcard.patterns:
1950 raise TypeError(
1951 f"Collection pattern(s) {collection_wildcard.patterns} not allowed in this context."
1952 )
1953 missing: list[str] = []
1954 composition = self._managers.datasets.resolve_wildcard(
1955 datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
1956 )
1957 if missing and mode == "constrain":
1958 # After v26 this should raise MissingDatasetTypeError, to be
1959 # implemented on DM-36303.
1960 warnings.warn(
1961 f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
1962 FutureWarning,
1963 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
1964 )
1965 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
1966 elif collections:
1967 # I think this check should actually be `collections is not None`,
1968 # but it looks like some CLI scripts use empty tuple as default.
1969 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1970 return composition, collection_wildcard
1972 def queryDatasets(
1973 self,
1974 datasetType: Any,
1975 *,
1976 collections: CollectionArgType | None = None,
1977 dimensions: Iterable[Dimension | str] | None = None,
1978 dataId: DataId | None = None,
1979 where: str = "",
1980 findFirst: bool = False,
1981 components: bool | None = False,
1982 bind: Mapping[str, Any] | None = None,
1983 check: bool = True,
1984 **kwargs: Any,
1985 ) -> queries.DatasetQueryResults:
1986 """Query for and iterate over dataset references matching user-provided
1987 criteria.
1989 Parameters
1990 ----------
1991 datasetType : dataset type expression
1992 An expression that fully or partially identifies the dataset types
1993 to be queried. Allowed types include `DatasetType`, `str`,
1994 `re.Pattern`, and iterables thereof. The special value ``...`` can
1995 be used to query all dataset types. See
1996 :ref:`daf_butler_dataset_type_expressions` for more information.
1997 collections : collection expression, optional
1998 An expression that identifies the collections to search, such as a
1999 `str` (for full matches or partial matches via globs), `re.Pattern`
2000 (for partial matches), or iterable thereof. ``...`` can be used to
2001 search all collections (actually just all `~CollectionType.RUN`
2002 collections, because this will still find all datasets).
2003 If not provided, ``self.default.collections`` is used. See
2004 :ref:`daf_butler_collection_expressions` for more information.
2005 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
2006 Dimensions to include in the query (in addition to those used
2007 to identify the queried dataset type(s)), either to constrain
2008 the resulting datasets to those for which a matching dimension
2009 exists, or to relate the dataset type's dimensions to dimensions
2010 referenced by the ``dataId`` or ``where`` arguments.
2011 dataId : `dict` or `DataCoordinate`, optional
2012 A data ID whose key-value pairs are used as equality constraints
2013 in the query.
2014 where : `str`, optional
2015 A string expression similar to a SQL WHERE clause. May involve
2016 any column of a dimension table or (as a shortcut for the primary
2017 key column of a dimension table) dimension name. See
2018 :ref:`daf_butler_dimension_expressions` for more information.
2019 findFirst : `bool`, optional
2020 If `True` (`False` is default), for each result data ID, only
2021 yield one `DatasetRef` of each `DatasetType`, from the first
2022 collection in which a dataset of that dataset type appears
2023 (according to the order of ``collections`` passed in). If `True`,
2024 ``collections`` must not contain regular expressions and may not
2025 be ``...``.
2026 components : `bool`, optional
2027 If `True`, apply all dataset expression patterns to component
2028 dataset type names as well. If `False`, never apply patterns to
2029 components. If `None`, apply patterns to components only
2030 if their parent datasets were not matched by the expression.
2031 Fully-specified component datasets (`str` or `DatasetType`
2032 instances) are always included.
2034 Values other than `False` are deprecated, and only `False` will be
2035 supported after v26. After v27 this argument will be removed
2036 entirely.
2037 bind : `~collections.abc.Mapping`, optional
2038 Mapping containing literal values that should be injected into the
2039 ``where`` expression, keyed by the identifiers they replace.
2040 Values of collection type can be expanded in some cases; see
2041 :ref:`daf_butler_dimension_expressions_identifiers` for more
2042 information.
2043 check : `bool`, optional
2044 If `True` (default) check the query for consistency before
2045 executing it. This may reject some valid queries that resemble
2046 common mistakes (e.g. queries for visits without specifying an
2047 instrument).
2048 **kwargs
2049 Additional keyword arguments are forwarded to
2050 `DataCoordinate.standardize` when processing the ``dataId``
2051 argument (and may be used to provide a constraining data ID even
2052 when the ``dataId`` argument is `None`).
2054 Returns
2055 -------
2056 refs : `.queries.DatasetQueryResults`
2057 Dataset references matching the given query criteria. Nested data
2058 IDs are guaranteed to include values for all implied dimensions
2059 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
2060 include dimension records (`DataCoordinate.hasRecords` will be
2061 `False`) unless `~.queries.DatasetQueryResults.expanded` is
2062 called on the result object (which returns a new one).
2064 Raises
2065 ------
2066 lsst.daf.butler.registry.DatasetTypeExpressionError
2067 Raised when ``datasetType`` expression is invalid.
2068 TypeError
2069 Raised when the arguments are incompatible, such as when a
2070 collection wildcard is passed when ``findFirst`` is `True`, or
2071 when ``collections`` is `None` and ``self.defaults.collections`` is
2072 also `None`.
2073 lsst.daf.butler.registry.DataIdError
2074 Raised when ``dataId`` or keyword arguments specify unknown
2075 dimensions or values, or when they contain inconsistent values.
2076 lsst.daf.butler.registry.UserExpressionError
2077 Raised when ``where`` expression is invalid.
2079 Notes
2080 -----
2081 When multiple dataset types are queried in a single call, the
2082 results of this operation are equivalent to querying for each dataset
2083 type separately in turn, and no information about the relationships
2084 between datasets of different types is included. In contexts where
2085 that kind of information is important, the recommended pattern is to
2086 use `queryDataIds` to first obtain data IDs (possibly with the
2087 desired dataset types and collections passed as constraints to the
2088 query), and then use multiple (generally much simpler) calls to
2089 `queryDatasets` with the returned data IDs passed as constraints.
2090 """
2091 doomed_by: list[str] = []
2092 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2093 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2094 datasetType,
2095 collections,
2096 components,
2097 mode="find_first" if findFirst else "find_all",
2098 doomed_by=doomed_by,
2099 )
2100 if collection_wildcard is not None and collection_wildcard.empty():
2101 doomed_by.append("No datasets can be found because collection list is empty.")
2102 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2103 parent_results: list[queries.ParentDatasetQueryResults] = []
2104 for parent_dataset_type, components_for_parent in dataset_composition.items():
2105 # The full set of dimensions in the query is the combination of
2106 # those needed for the DatasetType and those explicitly requested,
2107 # if any.
2108 dimension_names = set(parent_dataset_type.dimensions.names)
2109 if dimensions is not None:
2110 dimension_names.update(self.dimensions.conform(dimensions).names)
2111 # Construct the summary structure needed to construct a
2112 # QueryBuilder.
2113 summary = queries.QuerySummary(
2114 requested=self.dimensions.conform(dimension_names),
2115 column_types=self._managers.column_types,
2116 data_id=data_id,
2117 expression=where,
2118 bind=bind,
2119 defaults=self.defaults.dataId,
2120 check=check,
2121 datasets=[parent_dataset_type],
2122 )
2123 builder = self._makeQueryBuilder(summary)
2124 # Add the dataset subquery to the query, telling the QueryBuilder
2125 # to include the rank of the selected collection in the results
2126 # only if we need to findFirst. Note that if any of the
2127 # collections are actually wildcard expressions, and
2128 # findFirst=True, this will raise TypeError for us.
2129 builder.joinDataset(parent_dataset_type, collection_wildcard, isResult=True, findFirst=findFirst)
2130 query = builder.finish()
2131 parent_results.append(
2132 queries.ParentDatasetQueryResults(
2133 query, parent_dataset_type, components=components_for_parent
2134 )
2135 )
2136 if not parent_results:
2137 doomed_by.extend(
2138 f"No registered dataset type matching {t!r} found, so no matching datasets can "
2139 "exist in any collection."
2140 for t in ensure_iterable(datasetType)
2141 )
2142 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2143 elif len(parent_results) == 1:
2144 return parent_results[0]
2145 else:
2146 return queries.ChainedDatasetQueryResults(parent_results)
2148 def queryDataIds(
2149 self,
2150 # TODO: Drop Dimension support on DM-41326.
2151 dimensions: DimensionGroup | Iterable[Dimension | str] | Dimension | str,
2152 *,
2153 dataId: DataId | None = None,
2154 datasets: Any = None,
2155 collections: CollectionArgType | None = None,
2156 where: str = "",
2157 components: bool | None = None,
2158 bind: Mapping[str, Any] | None = None,
2159 check: bool = True,
2160 **kwargs: Any,
2161 ) -> queries.DataCoordinateQueryResults:
2162 """Query for data IDs matching user-provided criteria.
2164 Parameters
2165 ----------
2166 dimensions : `DimensionGroup`, `Dimension`, or `str`, or \
2167 `~collections.abc.Iterable` [ `Dimension` or `str` ]
2168 The dimensions of the data IDs to yield, as either `Dimension`
2169 instances or `str`. Will be automatically expanded to a complete
2170 `DimensionGroup`. Support for `Dimension` instances is deprecated
2171 and will not be supported after v27.
2172 dataId : `dict` or `DataCoordinate`, optional
2173 A data ID whose key-value pairs are used as equality constraints
2174 in the query.
2175 datasets : dataset type expression, optional
2176 An expression that fully or partially identifies dataset types
2177 that should constrain the yielded data IDs. For example, including
2178 "raw" here would constrain the yielded ``instrument``,
2179 ``exposure``, ``detector``, and ``physical_filter`` values to only
2180 those for which at least one "raw" dataset exists in
2181 ``collections``. Allowed types include `DatasetType`, `str`,
2182 and iterables thereof. Regular expression objects (i.e.
2183 `re.Pattern`) are deprecated and will be removed after the v26
2184 release. See :ref:`daf_butler_dataset_type_expressions` for more
2185 information.
2186 collections : collection expression, optional
2187 An expression that identifies the collections to search for
2188 datasets, such as a `str` (for full matches or partial matches
2189 via globs), `re.Pattern` (for partial matches), or iterable
2190 thereof. ``...`` can be used to search all collections (actually
2191 just all `~CollectionType.RUN` collections, because this will
2192 still find all datasets). If not provided,
2193 ``self.default.collections`` is used. Ignored unless ``datasets``
2194 is also passed. See :ref:`daf_butler_collection_expressions` for
2195 more information.
2196 where : `str`, optional
2197 A string expression similar to a SQL WHERE clause. May involve
2198 any column of a dimension table or (as a shortcut for the primary
2199 key column of a dimension table) dimension name. See
2200 :ref:`daf_butler_dimension_expressions` for more information.
2201 components : `bool`, optional
2202 If `True`, apply all dataset expression patterns to component
2203 dataset type names as well. If `False`, never apply patterns to
2204 components. If `None`, apply patterns to components only
2205 if their parent datasets were not matched by the expression.
2206 Fully-specified component datasets (`str` or `DatasetType`
2207 instances) are always included.
2209 Values other than `False` are deprecated, and only `False` will be
2210 supported after v26. After v27 this argument will be removed
2211 entirely.
2212 bind : `~collections.abc.Mapping`, optional
2213 Mapping containing literal values that should be injected into the
2214 ``where`` expression, keyed by the identifiers they replace.
2215 Values of collection type can be expanded in some cases; see
2216 :ref:`daf_butler_dimension_expressions_identifiers` for more
2217 information.
2218 check : `bool`, optional
2219 If `True` (default) check the query for consistency before
2220 executing it. This may reject some valid queries that resemble
2221 common mistakes (e.g. queries for visits without specifying an
2222 instrument).
2223 **kwargs
2224 Additional keyword arguments are forwarded to
2225 `DataCoordinate.standardize` when processing the ``dataId``
2226 argument (and may be used to provide a constraining data ID even
2227 when the ``dataId`` argument is `None`).
2229 Returns
2230 -------
2231 dataIds : `.queries.DataCoordinateQueryResults`
2232 Data IDs matching the given query parameters. These are guaranteed
2233 to identify all dimensions (`DataCoordinate.hasFull` returns
2234 `True`), but will not contain `DimensionRecord` objects
2235 (`DataCoordinate.hasRecords` returns `False`). Call
2236 `~.queries.DataCoordinateQueryResults.expanded` on the
2237 returned object to fetch those (and consider using
2238 `~.queries.DataCoordinateQueryResults.materialize` on the
2239 returned object first if the expected number of rows is very
2240 large). See documentation for those methods for additional
2241 information.
2243 Raises
2244 ------
2245 lsst.daf.butler.registry.NoDefaultCollectionError
2246 Raised if ``collections`` is `None` and
2247 ``self.defaults.collections`` is `None`.
2248 lsst.daf.butler.registry.CollectionExpressionError
2249 Raised when ``collections`` expression is invalid.
2250 lsst.daf.butler.registry.DataIdError
2251 Raised when ``dataId`` or keyword arguments specify unknown
2252 dimensions or values, or when they contain inconsistent values.
2253 lsst.daf.butler.registry.DatasetTypeExpressionError
2254 Raised when ``datasetType`` expression is invalid.
2255 lsst.daf.butler.registry.UserExpressionError
2256 Raised when ``where`` expression is invalid.
2257 """
2258 requested_dimensions = self.dimensions.conform(dimensions)
2259 doomed_by: list[str] = []
2260 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2261 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2262 datasets, collections, components, doomed_by=doomed_by
2263 )
2264 if collection_wildcard is not None and collection_wildcard.empty():
2265 doomed_by.append("No data coordinates can be found because collection list is empty.")
2266 summary = queries.QuerySummary(
2267 requested=requested_dimensions,
2268 column_types=self._managers.column_types,
2269 data_id=data_id,
2270 expression=where,
2271 bind=bind,
2272 defaults=self.defaults.dataId,
2273 check=check,
2274 datasets=dataset_composition.keys(),
2275 )
2276 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2277 for datasetType in dataset_composition:
2278 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2279 query = builder.finish()
2281 return queries.DataCoordinateQueryResults(query)
2283 def queryDimensionRecords(
2284 self,
2285 element: DimensionElement | str,
2286 *,
2287 dataId: DataId | None = None,
2288 datasets: Any = None,
2289 collections: CollectionArgType | None = None,
2290 where: str = "",
2291 components: bool | None = None,
2292 bind: Mapping[str, Any] | None = None,
2293 check: bool = True,
2294 **kwargs: Any,
2295 ) -> queries.DimensionRecordQueryResults:
2296 """Query for dimension information matching user-provided criteria.
2298 Parameters
2299 ----------
2300 element : `DimensionElement` or `str`
2301 The dimension element to obtain records for.
2302 dataId : `dict` or `DataCoordinate`, optional
2303 A data ID whose key-value pairs are used as equality constraints
2304 in the query.
2305 datasets : dataset type expression, optional
2306 An expression that fully or partially identifies dataset types
2307 that should constrain the yielded records. See `queryDataIds` and
2308 :ref:`daf_butler_dataset_type_expressions` for more information.
2309 collections : collection expression, optional
2310 An expression that identifies the collections to search for
2311 datasets, such as a `str` (for full matches or partial matches
2312 via globs), `re.Pattern` (for partial matches), or iterable
2313 thereof. ``...`` can be used to search all collections (actually
2314 just all `~CollectionType.RUN` collections, because this will
2315 still find all datasets). If not provided,
2316 ``self.default.collections`` is used. Ignored unless ``datasets``
2317 is also passed. See :ref:`daf_butler_collection_expressions` for
2318 more information.
2319 where : `str`, optional
2320 A string expression similar to a SQL WHERE clause. See
2321 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
2322 information.
2323 components : `bool`, optional
2324 Whether to apply dataset expressions to components as well.
2325 See `queryDataIds` for more information.
2327 Values other than `False` are deprecated, and only `False` will be
2328 supported after v26. After v27 this argument will be removed
2329 entirely.
2330 bind : `~collections.abc.Mapping`, optional
2331 Mapping containing literal values that should be injected into the
2332 ``where`` expression, keyed by the identifiers they replace.
2333 Values of collection type can be expanded in some cases; see
2334 :ref:`daf_butler_dimension_expressions_identifiers` for more
2335 information.
2336 check : `bool`, optional
2337 If `True` (default) check the query for consistency before
2338 executing it. This may reject some valid queries that resemble
2339 common mistakes (e.g. queries for visits without specifying an
2340 instrument).
2341 **kwargs
2342 Additional keyword arguments are forwarded to
2343 `DataCoordinate.standardize` when processing the ``dataId``
2344 argument (and may be used to provide a constraining data ID even
2345 when the ``dataId`` argument is `None`).
2347 Returns
2348 -------
2349 dataIds : `.queries.DimensionRecordQueryResults`
2350 Data IDs matching the given query parameters.
2352 Raises
2353 ------
2354 lsst.daf.butler.registry.NoDefaultCollectionError
2355 Raised if ``collections`` is `None` and
2356 ``self.defaults.collections`` is `None`.
2357 lsst.daf.butler.registry.CollectionExpressionError
2358 Raised when ``collections`` expression is invalid.
2359 lsst.daf.butler.registry.DataIdError
2360 Raised when ``dataId`` or keyword arguments specify unknown
2361 dimensions or values, or when they contain inconsistent values.
2362 lsst.daf.butler.registry.DatasetTypeExpressionError
2363 Raised when ``datasetType`` expression is invalid.
2364 lsst.daf.butler.registry.UserExpressionError
2365 Raised when ``where`` expression is invalid.
2366 """
2367 if not isinstance(element, DimensionElement):
2368 try:
2369 element = self.dimensions[element]
2370 except KeyError as e:
2371 raise DimensionNameError(
2372 f"No such dimension '{element}', available dimensions: " + str(self.dimensions.elements)
2373 ) from e
2374 doomed_by: list[str] = []
2375 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2376 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2377 datasets, collections, components, doomed_by=doomed_by
2378 )
2379 if collection_wildcard is not None and collection_wildcard.empty():
2380 doomed_by.append("No dimension records can be found because collection list is empty.")
2381 summary = queries.QuerySummary(
2382 requested=element.minimal_group,
2383 column_types=self._managers.column_types,
2384 data_id=data_id,
2385 expression=where,
2386 bind=bind,
2387 defaults=self.defaults.dataId,
2388 check=check,
2389 datasets=dataset_composition.keys(),
2390 )
2391 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2392 for datasetType in dataset_composition:
2393 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2394 query = builder.finish().with_record_columns(element.name)
2395 return queries.DatabaseDimensionRecordQueryResults(query, element)
2397 def queryDatasetAssociations(
2398 self,
2399 datasetType: str | DatasetType,
2400 collections: CollectionArgType | None = ...,
2401 *,
2402 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
2403 flattenChains: bool = False,
2404 ) -> Iterator[DatasetAssociation]:
2405 """Iterate over dataset-collection combinations where the dataset is in
2406 the collection.
2408 This method is a temporary placeholder for better support for
2409 association results in `queryDatasets`. It will probably be
2410 removed in the future, and should be avoided in production code
2411 whenever possible.
2413 Parameters
2414 ----------
2415 datasetType : `DatasetType` or `str`
2416 A dataset type object or the name of one.
2417 collections : collection expression, optional
2418 An expression that identifies the collections to search for
2419 datasets, such as a `str` (for full matches or partial matches
2420 via globs), `re.Pattern` (for partial matches), or iterable
2421 thereof. ``...`` can be used to search all collections (actually
2422 just all `~CollectionType.RUN` collections, because this will still
2423 find all datasets). If not provided, ``self.default.collections``
2424 is used. See :ref:`daf_butler_collection_expressions` for more
2425 information.
2426 collectionTypes : `~collections.abc.Set` [ `CollectionType` ], optional
2427 If provided, only yield associations from collections of these
2428 types.
2429 flattenChains : `bool`, optional
2430 If `True`, search in the children of `~CollectionType.CHAINED`
2431 collections. If `False`, ``CHAINED`` collections are ignored.
2433 Yields
2434 ------
2435 association : `.DatasetAssociation`
2436 Object representing the relationship between a single dataset and
2437 a single collection.
2439 Raises
2440 ------
2441 lsst.daf.butler.registry.NoDefaultCollectionError
2442 Raised if ``collections`` is `None` and
2443 ``self.defaults.collections`` is `None`.
2444 lsst.daf.butler.registry.CollectionExpressionError
2445 Raised when ``collections`` expression is invalid.
2446 """
2447 if collections is None:
2448 if not self.defaults.collections:
2449 raise NoDefaultCollectionError(
2450 "No collections provided to queryDatasetAssociations, "
2451 "and no defaults from registry construction."
2452 )
2453 collections = self.defaults.collections
2454 collection_wildcard = CollectionWildcard.from_expression(collections)
2455 backend = queries.SqlQueryBackend(self._db, self._managers)
2456 parent_dataset_type, _ = backend.resolve_single_dataset_type_wildcard(datasetType, components=False)
2457 timespan_tag = DatasetColumnTag(parent_dataset_type.name, "timespan")
2458 collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
2459 for parent_collection_record in backend.resolve_collection_wildcard(
2460 collection_wildcard,
2461 collection_types=frozenset(collectionTypes),
2462 flatten_chains=flattenChains,
2463 ):
2464 # Resolve this possibly-chained collection into a list of
2465 # non-CHAINED collections that actually hold datasets of this
2466 # type.
2467 candidate_collection_records = backend.resolve_dataset_collections(
2468 parent_dataset_type,
2469 CollectionWildcard.from_names([parent_collection_record.name]),
2470 allow_calibration_collections=True,
2471 governor_constraints={},
2472 )
2473 if not candidate_collection_records:
2474 continue
2475 with backend.context() as context:
2476 relation = backend.make_dataset_query_relation(
2477 parent_dataset_type,
2478 candidate_collection_records,
2479 columns={"dataset_id", "run", "timespan", "collection"},
2480 context=context,
2481 )
2482 reader = queries.DatasetRefReader(
2483 parent_dataset_type,
2484 translate_collection=lambda k: self._managers.collections[k].name,
2485 full=False,
2486 )
2487 for row in context.fetch_iterable(relation):
2488 ref = reader.read(row)
2489 collection_record = self._managers.collections[row[collection_tag]]
2490 if collection_record.type is CollectionType.CALIBRATION:
2491 timespan = row[timespan_tag]
2492 else:
2493 # For backwards compatibility and (possibly?) user
2494 # convenience we continue to define the timespan of a
2495 # DatasetAssociation row for a non-CALIBRATION
2496 # collection to be None rather than a fully unbounded
2497 # timespan.
2498 timespan = None
2499 yield DatasetAssociation(ref=ref, collection=collection_record.name, timespan=timespan)
2501 def get_datastore_records(self, ref: DatasetRef) -> DatasetRef:
2502 """Retrieve datastore records for given ref.
2504 Parameters
2505 ----------
2506 ref : `DatasetRef`
2507 Dataset reference for which to retrieve its corresponding datastore
2508 records.
2510 Returns
2511 -------
2512 updated_ref : `DatasetRef`
2513 Dataset reference with filled datastore records.
2515 Notes
2516 -----
2517 If this method is called with the dataset ref that is not known to the
2518 registry then the reference with an empty set of records is returned.
2519 """
2520 datastore_records: dict[str, list[StoredDatastoreItemInfo]] = {}
2521 for opaque, record_class in self._datastore_record_classes.items():
2522 records = self.fetchOpaqueData(opaque, dataset_id=ref.id)
2523 datastore_records[opaque] = [record_class.from_record(record) for record in records]
2524 return ref.replace(datastore_records=datastore_records)
2526 def store_datastore_records(self, refs: Mapping[str, DatasetRef]) -> None:
2527 """Store datastore records for given refs.
2529 Parameters
2530 ----------
2531 refs : `~collections.abc.Mapping` [`str`, `DatasetRef`]
2532 Mapping of a datastore name to dataset reference stored in that
2533 datastore, reference must include datastore records.
2534 """
2535 for datastore_name, ref in refs.items():
2536 # Store ref IDs in the bridge table.
2537 bridge = self._managers.datastores.register(datastore_name)
2538 bridge.insert([ref])
2540 # store records in opaque tables
2541 assert ref._datastore_records is not None, "Dataset ref must have datastore records"
2542 for table_name, records in ref._datastore_records.items():
2543 opaque_table = self._managers.opaque.get(table_name)
2544 assert opaque_table is not None, f"Unexpected opaque table name {table_name}"
2545 opaque_table.insert(*(record.to_record(dataset_id=ref.id) for record in records))
2547 def make_datastore_tables(self, tables: Mapping[str, DatastoreOpaqueTable]) -> None:
2548 """Create opaque tables used by datastores.
2550 Parameters
2551 ----------
2552 tables : `~collections.abc.Mapping`
2553 Maps opaque table name to its definition.
2555 Notes
2556 -----
2557 This method should disappear in the future when opaque table
2558 definitions will be provided during `Registry` construction.
2559 """
2560 datastore_record_classes = {}
2561 for table_name, table_def in tables.items():
2562 datastore_record_classes[table_name] = table_def.record_class
2563 try:
2564 self._managers.opaque.register(table_name, table_def.table_spec)
2565 except ReadOnlyDatabaseError:
2566 # If the database is read only and we just tried and failed to
2567 # create a table, it means someone is trying to create a
2568 # read-only butler client for an empty repo. That should be
2569 # okay, as long as they then try to get any datasets before
2570 # some other client creates the table. Chances are they're
2571 # just validating configuration.
2572 pass
2573 self._datastore_record_classes = datastore_record_classes
2575 @property
2576 def obsCoreTableManager(self) -> ObsCoreTableManager | None:
2577 """The ObsCore manager instance for this registry
2578 (`~.interfaces.ObsCoreTableManager`
2579 or `None`).
2581 ObsCore manager may not be implemented for all registry backend, or
2582 may not be enabled for many repositories.
2583 """
2584 return self._managers.obscore
2586 storageClasses: StorageClassFactory
2587 """All storage classes known to the registry (`StorageClassFactory`).
2588 """
2590 _defaults: RegistryDefaults
2591 """Default collections used for registry queries (`RegistryDefaults`)."""