Coverage for python/lsst/daf/butler/registry/sql_registry.py: 18%
570 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from .. import ddl
32__all__ = ("SqlRegistry",)
34import contextlib
35import logging
36import warnings
37from collections.abc import Iterable, Iterator, Mapping, Sequence
38from typing import TYPE_CHECKING, Any, Literal, cast
40import sqlalchemy
41from lsst.daf.relation import LeafRelation, Relation
42from lsst.resources import ResourcePathExpression
43from lsst.utils.introspection import find_outside_stacklevel
44from lsst.utils.iteration import ensure_iterable
46from .._column_tags import DatasetColumnTag
47from .._config import Config
48from .._dataset_association import DatasetAssociation
49from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
50from .._dataset_type import DatasetType
51from .._named import NamedKeyMapping, NameLookupMapping
52from .._storage_class import StorageClassFactory
53from .._timespan import Timespan
54from ..dimensions import (
55 DataCoordinate,
56 DataId,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63)
64from ..progress import Progress
65from ..registry import (
66 ArgumentError,
67 CollectionExpressionError,
68 CollectionSummary,
69 CollectionType,
70 CollectionTypeError,
71 ConflictingDefinitionError,
72 DataIdValueError,
73 DatasetTypeError,
74 DimensionNameError,
75 InconsistentDataIdError,
76 NoDefaultCollectionError,
77 OrphanedRecordError,
78 RegistryConfig,
79 RegistryConsistencyError,
80 RegistryDefaults,
81 queries,
82)
83from ..registry.interfaces import ChainedCollectionRecord, ReadOnlyDatabaseError, RunRecord
84from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
85from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
86from ..utils import transactional
88if TYPE_CHECKING:
89 from .._butler_config import ButlerConfig
90 from ..datastore._datastore import DatastoreOpaqueTable
91 from ..datastore.stored_file_info import StoredDatastoreItemInfo
92 from ..registry._registry import CollectionArgType
93 from ..registry.interfaces import (
94 CollectionRecord,
95 Database,
96 DatastoreRegistryBridgeManager,
97 ObsCoreTableManager,
98 )
101_LOG = logging.getLogger(__name__)
104class SqlRegistry:
105 """Butler Registry implementation that uses SQL database as backend.
107 Parameters
108 ----------
109 database : `Database`
110 Database instance to store Registry.
111 defaults : `RegistryDefaults`
112 Default collection search path and/or output `~CollectionType.RUN`
113 collection.
114 managers : `RegistryManagerInstances`
115 All the managers required for this registry.
116 """
118 defaultConfigFile: str | None = None
119 """Path to configuration defaults. Accessed within the ``configs`` resource
120 or relative to a search path. Can be None if no defaults specified.
121 """
123 @classmethod
124 def forceRegistryConfig(
125 cls, config: ButlerConfig | RegistryConfig | Config | str | None
126 ) -> RegistryConfig:
127 """Force the supplied config to a `RegistryConfig`.
129 Parameters
130 ----------
131 config : `RegistryConfig`, `Config` or `str` or `None`
132 Registry configuration, if missing then default configuration will
133 be loaded from registry.yaml.
135 Returns
136 -------
137 registry_config : `RegistryConfig`
138 A registry config.
139 """
140 if not isinstance(config, RegistryConfig):
141 if isinstance(config, str | Config) or config is None:
142 config = RegistryConfig(config)
143 else:
144 raise ValueError(f"Incompatible Registry configuration: {config}")
145 return config
147 @classmethod
148 def createFromConfig(
149 cls,
150 config: RegistryConfig | str | None = None,
151 dimensionConfig: DimensionConfig | str | None = None,
152 butlerRoot: ResourcePathExpression | None = None,
153 ) -> SqlRegistry:
154 """Create registry database and return `SqlRegistry` instance.
156 This method initializes database contents, database must be empty
157 prior to calling this method.
159 Parameters
160 ----------
161 config : `RegistryConfig` or `str`, optional
162 Registry configuration, if missing then default configuration will
163 be loaded from registry.yaml.
164 dimensionConfig : `DimensionConfig` or `str`, optional
165 Dimensions configuration, if missing then default configuration
166 will be loaded from dimensions.yaml.
167 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
168 Path to the repository root this `SqlRegistry` will manage.
170 Returns
171 -------
172 registry : `SqlRegistry`
173 A new `SqlRegistry` instance.
174 """
175 config = cls.forceRegistryConfig(config)
176 config.replaceRoot(butlerRoot)
178 if isinstance(dimensionConfig, str):
179 dimensionConfig = DimensionConfig(dimensionConfig)
180 elif dimensionConfig is None:
181 dimensionConfig = DimensionConfig()
182 elif not isinstance(dimensionConfig, DimensionConfig):
183 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
185 DatabaseClass = config.getDatabaseClass()
186 database = DatabaseClass.fromUri(
187 config.connectionString, origin=config.get("origin", 0), namespace=config.get("namespace")
188 )
189 managerTypes = RegistryManagerTypes.fromConfig(config)
190 managers = managerTypes.makeRepo(database, dimensionConfig)
191 return cls(database, RegistryDefaults(), managers)
193 @classmethod
194 def fromConfig(
195 cls,
196 config: ButlerConfig | RegistryConfig | Config | str,
197 butlerRoot: ResourcePathExpression | None = None,
198 writeable: bool = True,
199 defaults: RegistryDefaults | None = None,
200 ) -> SqlRegistry:
201 """Create `Registry` subclass instance from `config`.
203 Registry database must be initialized prior to calling this method.
205 Parameters
206 ----------
207 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
208 Registry configuration
209 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
210 Path to the repository root this `Registry` will manage.
211 writeable : `bool`, optional
212 If `True` (default) create a read-write connection to the database.
213 defaults : `RegistryDefaults`, optional
214 Default collection search path and/or output `~CollectionType.RUN`
215 collection.
217 Returns
218 -------
219 registry : `SqlRegistry`
220 A new `SqlRegistry` subclass instance.
221 """
222 config = cls.forceRegistryConfig(config)
223 config.replaceRoot(butlerRoot)
224 DatabaseClass = config.getDatabaseClass()
225 database = DatabaseClass.fromUri(
226 config.connectionString,
227 origin=config.get("origin", 0),
228 namespace=config.get("namespace"),
229 writeable=writeable,
230 )
231 managerTypes = RegistryManagerTypes.fromConfig(config)
232 with database.session():
233 managers = managerTypes.loadRepo(database)
234 if defaults is None:
235 defaults = RegistryDefaults()
236 return cls(database, defaults, managers)
238 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
239 self._db = database
240 self._managers = managers
241 self.storageClasses = StorageClassFactory()
242 # Intentionally invoke property setter to initialize defaults. This
243 # can only be done after most of the rest of Registry has already been
244 # initialized, and must be done before the property getter is used.
245 self.defaults = defaults
247 # TODO: This is currently initialized by `make_datastore_tables`,
248 # eventually we'll need to do it during construction.
249 # The mapping is indexed by the opaque table name.
250 self._datastore_record_classes: Mapping[str, type[StoredDatastoreItemInfo]] = {}
252 def __str__(self) -> str:
253 return str(self._db)
255 def __repr__(self) -> str:
256 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
258 def isWriteable(self) -> bool:
259 """Return `True` if this registry allows write operations, and `False`
260 otherwise.
261 """
262 return self._db.isWriteable()
264 def copy(self, defaults: RegistryDefaults | None = None) -> SqlRegistry:
265 """Create a new `SqlRegistry` backed by the same data repository
266 and connection as this one, but independent defaults.
268 Parameters
269 ----------
270 defaults : `~lsst.daf.butler.registry.RegistryDefaults`, optional
271 Default collections and data ID values for the new registry. If
272 not provided, ``self.defaults`` will be used (but future changes
273 to either registry's defaults will not affect the other).
275 Returns
276 -------
277 copy : `SqlRegistry`
278 A new `SqlRegistry` instance with its own defaults.
280 Notes
281 -----
282 Because the new registry shares a connection with the original, they
283 also share transaction state (despite the fact that their `transaction`
284 context manager methods do not reflect this), and must be used with
285 care.
286 """
287 if defaults is None:
288 # No need to copy, because `RegistryDefaults` is immutable; we
289 # effectively copy on write.
290 defaults = self.defaults
291 return type(self)(self._db, defaults, self._managers)
293 @property
294 def dimensions(self) -> DimensionUniverse:
295 """Definitions of all dimensions recognized by this `Registry`
296 (`DimensionUniverse`).
297 """
298 return self._managers.dimensions.universe
300 @property
301 def defaults(self) -> RegistryDefaults:
302 """Default collection search path and/or output `~CollectionType.RUN`
303 collection (`~lsst.daf.butler.registry.RegistryDefaults`).
305 This is an immutable struct whose components may not be set
306 individually, but the entire struct can be set by assigning to this
307 property.
308 """
309 return self._defaults
311 @defaults.setter
312 def defaults(self, value: RegistryDefaults) -> None:
313 if value.run is not None:
314 self.registerRun(value.run)
315 value.finish(self)
316 self._defaults = value
318 def refresh(self) -> None:
319 """Refresh all in-memory state by querying the database.
321 This may be necessary to enable querying for entities added by other
322 registry instances after this one was constructed.
323 """
324 with self._db.transaction():
325 self._managers.refresh()
327 @contextlib.contextmanager
328 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
329 """Return a context manager that represents a transaction."""
330 try:
331 with self._db.transaction(savepoint=savepoint):
332 yield
333 except BaseException:
334 # TODO: this clears the caches sometimes when we wouldn't actually
335 # need to. Can we avoid that?
336 self._managers.dimensions.clearCaches()
337 raise
339 def resetConnectionPool(self) -> None:
340 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
342 This operation is useful when using registry with fork-based
343 multiprocessing. To use registry across fork boundary one has to make
344 sure that there are no currently active connections (no session or
345 transaction is in progress) and connection pool is reset using this
346 method. This method should be called by the child process immediately
347 after the fork.
348 """
349 self._db._engine.dispose()
351 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
352 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
353 other data repository client.
355 Opaque table records can be added via `insertOpaqueData`, retrieved via
356 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
358 Parameters
359 ----------
360 tableName : `str`
361 Logical name of the opaque table. This may differ from the
362 actual name used in the database by a prefix and/or suffix.
363 spec : `ddl.TableSpec`
364 Specification for the table to be added.
365 """
366 self._managers.opaque.register(tableName, spec)
368 @transactional
369 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
370 """Insert records into an opaque table.
372 Parameters
373 ----------
374 tableName : `str`
375 Logical name of the opaque table. Must match the name used in a
376 previous call to `registerOpaqueTable`.
377 data
378 Each additional positional argument is a dictionary that represents
379 a single row to be added.
380 """
381 self._managers.opaque[tableName].insert(*data)
383 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[Mapping[str, Any]]:
384 """Retrieve records from an opaque table.
386 Parameters
387 ----------
388 tableName : `str`
389 Logical name of the opaque table. Must match the name used in a
390 previous call to `registerOpaqueTable`.
391 where
392 Additional keyword arguments are interpreted as equality
393 constraints that restrict the returned rows (combined with AND);
394 keyword arguments are column names and values are the values they
395 must have.
397 Yields
398 ------
399 row : `dict`
400 A dictionary representing a single result row.
401 """
402 yield from self._managers.opaque[tableName].fetch(**where)
404 @transactional
405 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
406 """Remove records from an opaque table.
408 Parameters
409 ----------
410 tableName : `str`
411 Logical name of the opaque table. Must match the name used in a
412 previous call to `registerOpaqueTable`.
413 where
414 Additional keyword arguments are interpreted as equality
415 constraints that restrict the deleted rows (combined with AND);
416 keyword arguments are column names and values are the values they
417 must have.
418 """
419 self._managers.opaque[tableName].delete(where.keys(), where)
421 def registerCollection(
422 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: str | None = None
423 ) -> bool:
424 """Add a new collection if one with the given name does not exist.
426 Parameters
427 ----------
428 name : `str`
429 The name of the collection to create.
430 type : `CollectionType`
431 Enum value indicating the type of collection to create.
432 doc : `str`, optional
433 Documentation string for the collection.
435 Returns
436 -------
437 registered : `bool`
438 Boolean indicating whether the collection was already registered
439 or was created by this call.
441 Notes
442 -----
443 This method cannot be called within transactions, as it needs to be
444 able to perform its own transaction to be concurrent.
445 """
446 _, registered = self._managers.collections.register(name, type, doc=doc)
447 return registered
449 def getCollectionType(self, name: str) -> CollectionType:
450 """Return an enumeration value indicating the type of the given
451 collection.
453 Parameters
454 ----------
455 name : `str`
456 The name of the collection.
458 Returns
459 -------
460 type : `CollectionType`
461 Enum value indicating the type of this collection.
463 Raises
464 ------
465 lsst.daf.butler.registry.MissingCollectionError
466 Raised if no collection with the given name exists.
467 """
468 return self._managers.collections.find(name).type
470 def _get_collection_record(self, name: str) -> CollectionRecord:
471 """Return the record for this collection.
473 Parameters
474 ----------
475 name : `str`
476 Name of the collection for which the record is to be retrieved.
478 Returns
479 -------
480 record : `CollectionRecord`
481 The record for this collection.
482 """
483 return self._managers.collections.find(name)
485 def registerRun(self, name: str, doc: str | None = None) -> bool:
486 """Add a new run if one with the given name does not exist.
488 Parameters
489 ----------
490 name : `str`
491 The name of the run to create.
492 doc : `str`, optional
493 Documentation string for the collection.
495 Returns
496 -------
497 registered : `bool`
498 Boolean indicating whether a new run was registered. `False`
499 if it already existed.
501 Notes
502 -----
503 This method cannot be called within transactions, as it needs to be
504 able to perform its own transaction to be concurrent.
505 """
506 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
507 return registered
509 @transactional
510 def removeCollection(self, name: str) -> None:
511 """Remove the given collection from the registry.
513 Parameters
514 ----------
515 name : `str`
516 The name of the collection to remove.
518 Raises
519 ------
520 lsst.daf.butler.registry.MissingCollectionError
521 Raised if no collection with the given name exists.
522 sqlalchemy.exc.IntegrityError
523 Raised if the database rows associated with the collection are
524 still referenced by some other table, such as a dataset in a
525 datastore (for `~CollectionType.RUN` collections only) or a
526 `~CollectionType.CHAINED` collection of which this collection is
527 a child.
529 Notes
530 -----
531 If this is a `~CollectionType.RUN` collection, all datasets and quanta
532 in it will removed from the `Registry` database. This requires that
533 those datasets be removed (or at least trashed) from any datastores
534 that hold them first.
536 A collection may not be deleted as long as it is referenced by a
537 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
538 be deleted or redefined first.
539 """
540 self._managers.collections.remove(name)
542 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
543 """Return the child collections in a `~CollectionType.CHAINED`
544 collection.
546 Parameters
547 ----------
548 parent : `str`
549 Name of the chained collection. Must have already been added via
550 a call to `Registry.registerCollection`.
552 Returns
553 -------
554 children : `~collections.abc.Sequence` [ `str` ]
555 An ordered sequence of collection names that are searched when the
556 given chained collection is searched.
558 Raises
559 ------
560 lsst.daf.butler.registry.MissingCollectionError
561 Raised if ``parent`` does not exist in the `Registry`.
562 lsst.daf.butler.registry.CollectionTypeError
563 Raised if ``parent`` does not correspond to a
564 `~CollectionType.CHAINED` collection.
565 """
566 record = self._managers.collections.find(parent)
567 if record.type is not CollectionType.CHAINED:
568 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
569 assert isinstance(record, ChainedCollectionRecord)
570 return record.children
572 @transactional
573 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
574 """Define or redefine a `~CollectionType.CHAINED` collection.
576 Parameters
577 ----------
578 parent : `str`
579 Name of the chained collection. Must have already been added via
580 a call to `Registry.registerCollection`.
581 children : collection expression
582 An expression defining an ordered search of child collections,
583 generally an iterable of `str`; see
584 :ref:`daf_butler_collection_expressions` for more information.
585 flatten : `bool`, optional
586 If `True` (`False` is default), recursively flatten out any nested
587 `~CollectionType.CHAINED` collections in ``children`` first.
589 Raises
590 ------
591 lsst.daf.butler.registry.MissingCollectionError
592 Raised when any of the given collections do not exist in the
593 `Registry`.
594 lsst.daf.butler.registry.CollectionTypeError
595 Raised if ``parent`` does not correspond to a
596 `~CollectionType.CHAINED` collection.
597 ValueError
598 Raised if the given collections contains a cycle.
599 """
600 record = self._managers.collections.find(parent)
601 if record.type is not CollectionType.CHAINED:
602 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
603 assert isinstance(record, ChainedCollectionRecord)
604 children = CollectionWildcard.from_expression(children).require_ordered()
605 if children != record.children or flatten:
606 record.update(self._managers.collections, children, flatten=flatten)
608 def getCollectionParentChains(self, collection: str) -> set[str]:
609 """Return the CHAINED collections that directly contain the given one.
611 Parameters
612 ----------
613 name : `str`
614 Name of the collection.
616 Returns
617 -------
618 chains : `set` of `str`
619 Set of `~CollectionType.CHAINED` collection names.
620 """
621 return {
622 record.name
623 for record in self._managers.collections.getParentChains(
624 self._managers.collections.find(collection).key
625 )
626 }
628 def getCollectionDocumentation(self, collection: str) -> str | None:
629 """Retrieve the documentation string for a collection.
631 Parameters
632 ----------
633 name : `str`
634 Name of the collection.
636 Returns
637 -------
638 docs : `str` or `None`
639 Docstring for the collection with the given name.
640 """
641 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
643 def setCollectionDocumentation(self, collection: str, doc: str | None) -> None:
644 """Set the documentation string for a collection.
646 Parameters
647 ----------
648 name : `str`
649 Name of the collection.
650 docs : `str` or `None`
651 Docstring for the collection with the given name; will replace any
652 existing docstring. Passing `None` will remove any existing
653 docstring.
654 """
655 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
657 def getCollectionSummary(self, collection: str) -> CollectionSummary:
658 """Return a summary for the given collection.
660 Parameters
661 ----------
662 collection : `str`
663 Name of the collection for which a summary is to be retrieved.
665 Returns
666 -------
667 summary : `~lsst.daf.butler.registry.CollectionSummary`
668 Summary of the dataset types and governor dimension values in
669 this collection.
670 """
671 record = self._managers.collections.find(collection)
672 return self._managers.datasets.getCollectionSummary(record)
674 def registerDatasetType(self, datasetType: DatasetType) -> bool:
675 """Add a new `DatasetType` to the Registry.
677 It is not an error to register the same `DatasetType` twice.
679 Parameters
680 ----------
681 datasetType : `DatasetType`
682 The `DatasetType` to be added.
684 Returns
685 -------
686 inserted : `bool`
687 `True` if ``datasetType`` was inserted, `False` if an identical
688 existing `DatasetType` was found. Note that in either case the
689 DatasetType is guaranteed to be defined in the Registry
690 consistently with the given definition.
692 Raises
693 ------
694 ValueError
695 Raised if the dimensions or storage class are invalid.
696 lsst.daf.butler.registry.ConflictingDefinitionError
697 Raised if this `DatasetType` is already registered with a different
698 definition.
700 Notes
701 -----
702 This method cannot be called within transactions, as it needs to be
703 able to perform its own transaction to be concurrent.
704 """
705 _, inserted = self._managers.datasets.register(datasetType)
706 return inserted
708 def removeDatasetType(self, name: str | tuple[str, ...]) -> None:
709 """Remove the named `DatasetType` from the registry.
711 .. warning::
713 Registry implementations can cache the dataset type definitions.
714 This means that deleting the dataset type definition may result in
715 unexpected behavior from other butler processes that are active
716 that have not seen the deletion.
718 Parameters
719 ----------
720 name : `str` or `tuple` [`str`]
721 Name of the type to be removed or tuple containing a list of type
722 names to be removed. Wildcards are allowed.
724 Raises
725 ------
726 lsst.daf.butler.registry.OrphanedRecordError
727 Raised if an attempt is made to remove the dataset type definition
728 when there are already datasets associated with it.
730 Notes
731 -----
732 If the dataset type is not registered the method will return without
733 action.
734 """
735 for datasetTypeExpression in ensure_iterable(name):
736 # Catch any warnings from the caller specifying a component
737 # dataset type. This will result in an error later but the
738 # warning could be confusing when the caller is not querying
739 # anything.
740 with warnings.catch_warnings():
741 warnings.simplefilter("ignore", category=FutureWarning)
742 datasetTypes = list(self.queryDatasetTypes(datasetTypeExpression))
743 if not datasetTypes:
744 _LOG.info("Dataset type %r not defined", datasetTypeExpression)
745 else:
746 for datasetType in datasetTypes:
747 self._managers.datasets.remove(datasetType.name)
748 _LOG.info("Removed dataset type %r", datasetType.name)
750 def getDatasetType(self, name: str) -> DatasetType:
751 """Get the `DatasetType`.
753 Parameters
754 ----------
755 name : `str`
756 Name of the type.
758 Returns
759 -------
760 type : `DatasetType`
761 The `DatasetType` associated with the given name.
763 Raises
764 ------
765 lsst.daf.butler.registry.MissingDatasetTypeError
766 Raised if the requested dataset type has not been registered.
768 Notes
769 -----
770 This method handles component dataset types automatically, though most
771 other registry operations do not.
772 """
773 parent_name, component = DatasetType.splitDatasetTypeName(name)
774 storage = self._managers.datasets[parent_name]
775 if component is None:
776 return storage.datasetType
777 else:
778 return storage.datasetType.makeComponentDatasetType(component)
780 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
781 """Test whether the given dataset ID generation mode is supported by
782 `insertDatasets`.
784 Parameters
785 ----------
786 mode : `DatasetIdGenEnum`
787 Enum value for the mode to test.
789 Returns
790 -------
791 supported : `bool`
792 Whether the given mode is supported.
793 """
794 return self._managers.datasets.supportsIdGenerationMode(mode)
796 def findDataset(
797 self,
798 datasetType: DatasetType | str,
799 dataId: DataId | None = None,
800 *,
801 collections: CollectionArgType | None = None,
802 timespan: Timespan | None = None,
803 datastore_records: bool = False,
804 **kwargs: Any,
805 ) -> DatasetRef | None:
806 """Find a dataset given its `DatasetType` and data ID.
808 This can be used to obtain a `DatasetRef` that permits the dataset to
809 be read from a `Datastore`. If the dataset is a component and can not
810 be found using the provided dataset type, a dataset ref for the parent
811 will be returned instead but with the correct dataset type.
813 Parameters
814 ----------
815 datasetType : `DatasetType` or `str`
816 A `DatasetType` or the name of one. If this is a `DatasetType`
817 instance, its storage class will be respected and propagated to
818 the output, even if it differs from the dataset type definition
819 in the registry, as long as the storage classes are convertible.
820 dataId : `dict` or `DataCoordinate`, optional
821 A `dict`-like object containing the `Dimension` links that identify
822 the dataset within a collection.
823 collections : collection expression, optional
824 An expression that fully or partially identifies the collections to
825 search for the dataset; see
826 :ref:`daf_butler_collection_expressions` for more information.
827 Defaults to ``self.defaults.collections``.
828 timespan : `Timespan`, optional
829 A timespan that the validity range of the dataset must overlap.
830 If not provided, any `~CollectionType.CALIBRATION` collections
831 matched by the ``collections`` argument will not be searched.
832 **kwargs
833 Additional keyword arguments passed to
834 `DataCoordinate.standardize` to convert ``dataId`` to a true
835 `DataCoordinate` or augment an existing one.
837 Returns
838 -------
839 ref : `DatasetRef`
840 A reference to the dataset, or `None` if no matching Dataset
841 was found.
843 Raises
844 ------
845 lsst.daf.butler.registry.NoDefaultCollectionError
846 Raised if ``collections`` is `None` and
847 ``self.defaults.collections`` is `None`.
848 LookupError
849 Raised if one or more data ID keys are missing.
850 lsst.daf.butler.registry.MissingDatasetTypeError
851 Raised if the dataset type does not exist.
852 lsst.daf.butler.registry.MissingCollectionError
853 Raised if any of ``collections`` does not exist in the registry.
855 Notes
856 -----
857 This method simply returns `None` and does not raise an exception even
858 when the set of collections searched is intrinsically incompatible with
859 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
860 only `~CollectionType.CALIBRATION` collections are being searched.
861 This may make it harder to debug some lookup failures, but the behavior
862 is intentional; we consider it more important that failed searches are
863 reported consistently, regardless of the reason, and that adding
864 additional collections that do not contain a match to the search path
865 never changes the behavior.
867 This method handles component dataset types automatically, though most
868 other registry operations do not.
869 """
870 if collections is None:
871 if not self.defaults.collections:
872 raise NoDefaultCollectionError(
873 "No collections provided to findDataset, and no defaults from registry construction."
874 )
875 collections = self.defaults.collections
876 backend = queries.SqlQueryBackend(self._db, self._managers)
877 collection_wildcard = CollectionWildcard.from_expression(collections, require_ordered=True)
878 if collection_wildcard.empty():
879 return None
880 matched_collections = backend.resolve_collection_wildcard(collection_wildcard)
881 parent_dataset_type, components = backend.resolve_single_dataset_type_wildcard(
882 datasetType, components_deprecated=False
883 )
884 if len(components) > 1:
885 raise DatasetTypeError(
886 f"findDataset requires exactly one dataset type; got multiple components {components} "
887 f"for parent dataset type {parent_dataset_type.name}."
888 )
889 component = components[0]
890 dataId = DataCoordinate.standardize(
891 dataId,
892 graph=parent_dataset_type.dimensions,
893 universe=self.dimensions,
894 defaults=self.defaults.dataId,
895 **kwargs,
896 )
897 governor_constraints = {name: {cast(str, dataId[name])} for name in dataId.graph.governors.names}
898 (filtered_collections,) = backend.filter_dataset_collections(
899 [parent_dataset_type],
900 matched_collections,
901 governor_constraints=governor_constraints,
902 ).values()
903 if not filtered_collections:
904 return None
905 if timespan is None:
906 filtered_collections = [
907 collection_record
908 for collection_record in filtered_collections
909 if collection_record.type is not CollectionType.CALIBRATION
910 ]
911 if filtered_collections:
912 requested_columns = {"dataset_id", "run", "collection"}
913 with backend.context() as context:
914 predicate = context.make_data_coordinate_predicate(
915 dataId.subset(parent_dataset_type.dimensions), full=False
916 )
917 if timespan is not None:
918 requested_columns.add("timespan")
919 predicate = predicate.logical_and(
920 context.make_timespan_overlap_predicate(
921 DatasetColumnTag(parent_dataset_type.name, "timespan"), timespan
922 )
923 )
924 relation = backend.make_dataset_query_relation(
925 parent_dataset_type, filtered_collections, requested_columns, context
926 ).with_rows_satisfying(predicate)
927 rows = list(context.fetch_iterable(relation))
928 else:
929 rows = []
930 if not rows:
931 return None
932 elif len(rows) == 1:
933 best_row = rows[0]
934 else:
935 rank_by_collection_key = {record.key: n for n, record in enumerate(filtered_collections)}
936 collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
937 row_iter = iter(rows)
938 best_row = next(row_iter)
939 best_rank = rank_by_collection_key[best_row[collection_tag]]
940 have_tie = False
941 for row in row_iter:
942 if (rank := rank_by_collection_key[row[collection_tag]]) < best_rank:
943 best_row = row
944 best_rank = rank
945 have_tie = False
946 elif rank == best_rank:
947 have_tie = True
948 assert timespan is not None, "Rank ties should be impossible given DB constraints."
949 if have_tie:
950 raise LookupError(
951 f"Ambiguous calibration lookup for {parent_dataset_type.name} in collections "
952 f"{collection_wildcard.strings} with timespan {timespan}."
953 )
954 reader = queries.DatasetRefReader(
955 parent_dataset_type,
956 translate_collection=lambda k: self._managers.collections[k].name,
957 )
958 ref = reader.read(best_row, data_id=dataId)
959 if component is not None:
960 ref = ref.makeComponentRef(component)
961 if datastore_records:
962 ref = self.get_datastore_records(ref)
964 return ref
966 @transactional
967 def insertDatasets(
968 self,
969 datasetType: DatasetType | str,
970 dataIds: Iterable[DataId],
971 run: str | None = None,
972 expand: bool = True,
973 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
974 ) -> list[DatasetRef]:
975 """Insert one or more datasets into the `Registry`.
977 This always adds new datasets; to associate existing datasets with
978 a new collection, use ``associate``.
980 Parameters
981 ----------
982 datasetType : `DatasetType` or `str`
983 A `DatasetType` or the name of one.
984 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
985 Dimension-based identifiers for the new datasets.
986 run : `str`, optional
987 The name of the run that produced the datasets. Defaults to
988 ``self.defaults.run``.
989 expand : `bool`, optional
990 If `True` (default), expand data IDs as they are inserted. This is
991 necessary in general to allow datastore to generate file templates,
992 but it may be disabled if the caller can guarantee this is
993 unnecessary.
994 idGenerationMode : `DatasetIdGenEnum`, optional
995 Specifies option for generating dataset IDs. By default unique IDs
996 are generated for each inserted dataset.
998 Returns
999 -------
1000 refs : `list` of `DatasetRef`
1001 Resolved `DatasetRef` instances for all given data IDs (in the same
1002 order).
1004 Raises
1005 ------
1006 lsst.daf.butler.registry.DatasetTypeError
1007 Raised if ``datasetType`` is not known to registry.
1008 lsst.daf.butler.registry.CollectionTypeError
1009 Raised if ``run`` collection type is not `~CollectionType.RUN`.
1010 lsst.daf.butler.registry.NoDefaultCollectionError
1011 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1012 lsst.daf.butler.registry.ConflictingDefinitionError
1013 If a dataset with the same dataset type and data ID as one of those
1014 given already exists in ``run``.
1015 lsst.daf.butler.registry.MissingCollectionError
1016 Raised if ``run`` does not exist in the registry.
1017 """
1018 if isinstance(datasetType, DatasetType):
1019 storage = self._managers.datasets.find(datasetType.name)
1020 if storage is None:
1021 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1022 else:
1023 storage = self._managers.datasets.find(datasetType)
1024 if storage is None:
1025 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
1026 if run is None:
1027 if self.defaults.run is None:
1028 raise NoDefaultCollectionError(
1029 "No run provided to insertDatasets, and no default from registry construction."
1030 )
1031 run = self.defaults.run
1032 runRecord = self._managers.collections.find(run)
1033 if runRecord.type is not CollectionType.RUN:
1034 raise CollectionTypeError(
1035 f"Given collection is of type {runRecord.type.name}; RUN collection required."
1036 )
1037 assert isinstance(runRecord, RunRecord)
1038 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1039 if expand:
1040 expandedDataIds = [
1041 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
1042 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
1043 ]
1044 else:
1045 expandedDataIds = [
1046 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
1047 ]
1048 try:
1049 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
1050 if self._managers.obscore:
1051 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1052 self._managers.obscore.add_datasets(refs, context)
1053 except sqlalchemy.exc.IntegrityError as err:
1054 raise ConflictingDefinitionError(
1055 "A database constraint failure was triggered by inserting "
1056 f"one or more datasets of type {storage.datasetType} into "
1057 f"collection '{run}'. "
1058 "This probably means a dataset with the same data ID "
1059 "and dataset type already exists, but it may also mean a "
1060 "dimension row is missing."
1061 ) from err
1062 return refs
1064 @transactional
1065 def _importDatasets(
1066 self,
1067 datasets: Iterable[DatasetRef],
1068 expand: bool = True,
1069 ) -> list[DatasetRef]:
1070 """Import one or more datasets into the `Registry`.
1072 Difference from `insertDatasets` method is that this method accepts
1073 `DatasetRef` instances which should already be resolved and have a
1074 dataset ID. If registry supports globally-unique dataset IDs (e.g.
1075 `uuid.UUID`) then datasets which already exist in the registry will be
1076 ignored if imported again.
1078 Parameters
1079 ----------
1080 datasets : `~collections.abc.Iterable` of `DatasetRef`
1081 Datasets to be inserted. All `DatasetRef` instances must have
1082 identical ``datasetType`` and ``run`` attributes. ``run``
1083 attribute can be `None` and defaults to ``self.defaults.run``.
1084 Datasets can specify ``id`` attribute which will be used for
1085 inserted datasets. All dataset IDs must have the same type
1086 (`int` or `uuid.UUID`), if type of dataset IDs does not match
1087 configured backend then IDs will be ignored and new IDs will be
1088 generated by backend.
1089 expand : `bool`, optional
1090 If `True` (default), expand data IDs as they are inserted. This is
1091 necessary in general, but it may be disabled if the caller can
1092 guarantee this is unnecessary.
1094 Returns
1095 -------
1096 refs : `list` of `DatasetRef`
1097 Resolved `DatasetRef` instances for all given data IDs (in the same
1098 order). If any of ``datasets`` has an ID which already exists in
1099 the database then it will not be inserted or updated, but a
1100 resolved `DatasetRef` will be returned for it in any case.
1102 Raises
1103 ------
1104 lsst.daf.butler.registry.NoDefaultCollectionError
1105 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1106 lsst.daf.butler.registry.DatasetTypeError
1107 Raised if datasets correspond to more than one dataset type or
1108 dataset type is not known to registry.
1109 lsst.daf.butler.registry.ConflictingDefinitionError
1110 If a dataset with the same dataset type and data ID as one of those
1111 given already exists in ``run``.
1112 lsst.daf.butler.registry.MissingCollectionError
1113 Raised if ``run`` does not exist in the registry.
1115 Notes
1116 -----
1117 This method is considered package-private and internal to Butler
1118 implementation. Clients outside daf_butler package should not use this
1119 method.
1120 """
1121 datasets = list(datasets)
1122 if not datasets:
1123 # nothing to do
1124 return []
1126 # find dataset type
1127 datasetTypes = {dataset.datasetType for dataset in datasets}
1128 if len(datasetTypes) != 1:
1129 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
1130 datasetType = datasetTypes.pop()
1132 # get storage handler for this dataset type
1133 storage = self._managers.datasets.find(datasetType.name)
1134 if storage is None:
1135 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1137 # find run name
1138 runs = {dataset.run for dataset in datasets}
1139 if len(runs) != 1:
1140 raise ValueError(f"Multiple run names in input datasets: {runs}")
1141 run = runs.pop()
1143 runRecord = self._managers.collections.find(run)
1144 if runRecord.type is not CollectionType.RUN:
1145 raise CollectionTypeError(
1146 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
1147 " RUN collection required."
1148 )
1149 assert isinstance(runRecord, RunRecord)
1151 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1152 if expand:
1153 expandedDatasets = [
1154 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
1155 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
1156 ]
1157 else:
1158 expandedDatasets = [
1159 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
1160 for dataset in datasets
1161 ]
1163 try:
1164 refs = list(storage.import_(runRecord, expandedDatasets))
1165 if self._managers.obscore:
1166 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1167 self._managers.obscore.add_datasets(refs, context)
1168 except sqlalchemy.exc.IntegrityError as err:
1169 raise ConflictingDefinitionError(
1170 "A database constraint failure was triggered by inserting "
1171 f"one or more datasets of type {storage.datasetType} into "
1172 f"collection '{run}'. "
1173 "This probably means a dataset with the same data ID "
1174 "and dataset type already exists, but it may also mean a "
1175 "dimension row is missing."
1176 ) from err
1177 # Check that imported dataset IDs match the input
1178 for imported_ref, input_ref in zip(refs, datasets, strict=True):
1179 if imported_ref.id != input_ref.id:
1180 raise RegistryConsistencyError(
1181 "Imported dataset ID differs from input dataset ID, "
1182 f"input ref: {input_ref}, imported ref: {imported_ref}"
1183 )
1184 return refs
1186 def getDataset(self, id: DatasetId) -> DatasetRef | None:
1187 """Retrieve a Dataset entry.
1189 Parameters
1190 ----------
1191 id : `DatasetId`
1192 The unique identifier for the dataset.
1194 Returns
1195 -------
1196 ref : `DatasetRef` or `None`
1197 A ref to the Dataset, or `None` if no matching Dataset
1198 was found.
1199 """
1200 return self._managers.datasets.getDatasetRef(id)
1202 @transactional
1203 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
1204 """Remove datasets from the Registry.
1206 The datasets will be removed unconditionally from all collections, and
1207 any `Quantum` that consumed this dataset will instead be marked with
1208 having a NULL input. `Datastore` records will *not* be deleted; the
1209 caller is responsible for ensuring that the dataset has already been
1210 removed from all Datastores.
1212 Parameters
1213 ----------
1214 refs : `~collections.abc.Iterable` [`DatasetRef`]
1215 References to the datasets to be removed. Must include a valid
1216 ``id`` attribute, and should be considered invalidated upon return.
1218 Raises
1219 ------
1220 lsst.daf.butler.AmbiguousDatasetError
1221 Raised if any ``ref.id`` is `None`.
1222 lsst.daf.butler.registry.OrphanedRecordError
1223 Raised if any dataset is still present in any `Datastore`.
1224 """
1225 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
1226 for datasetType, refsForType in progress.iter_item_chunks(
1227 DatasetRef.iter_by_type(refs), desc="Removing datasets by type"
1228 ):
1229 storage = self._managers.datasets[datasetType.name]
1230 try:
1231 storage.delete(refsForType)
1232 except sqlalchemy.exc.IntegrityError as err:
1233 raise OrphanedRecordError(
1234 "One or more datasets is still present in one or more Datastores."
1235 ) from err
1237 @transactional
1238 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1239 """Add existing datasets to a `~CollectionType.TAGGED` collection.
1241 If a DatasetRef with the same exact ID is already in a collection
1242 nothing is changed. If a `DatasetRef` with the same `DatasetType` and
1243 data ID but with different ID exists in the collection,
1244 `~lsst.daf.butler.registry.ConflictingDefinitionError` is raised.
1246 Parameters
1247 ----------
1248 collection : `str`
1249 Indicates the collection the datasets should be associated with.
1250 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1251 An iterable of resolved `DatasetRef` instances that already exist
1252 in this `Registry`.
1254 Raises
1255 ------
1256 lsst.daf.butler.registry.ConflictingDefinitionError
1257 If a Dataset with the given `DatasetRef` already exists in the
1258 given collection.
1259 lsst.daf.butler.registry.MissingCollectionError
1260 Raised if ``collection`` does not exist in the registry.
1261 lsst.daf.butler.registry.CollectionTypeError
1262 Raise adding new datasets to the given ``collection`` is not
1263 allowed.
1264 """
1265 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
1266 collectionRecord = self._managers.collections.find(collection)
1267 if collectionRecord.type is not CollectionType.TAGGED:
1268 raise CollectionTypeError(
1269 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
1270 )
1271 for datasetType, refsForType in progress.iter_item_chunks(
1272 DatasetRef.iter_by_type(refs), desc="Associating datasets by type"
1273 ):
1274 storage = self._managers.datasets[datasetType.name]
1275 try:
1276 storage.associate(collectionRecord, refsForType)
1277 if self._managers.obscore:
1278 # If a TAGGED collection is being monitored by ObsCore
1279 # manager then we may need to save the dataset.
1280 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1281 self._managers.obscore.associate(refsForType, collectionRecord, context)
1282 except sqlalchemy.exc.IntegrityError as err:
1283 raise ConflictingDefinitionError(
1284 f"Constraint violation while associating dataset of type {datasetType.name} with "
1285 f"collection {collection}. This probably means that one or more datasets with the same "
1286 "dataset type and data ID already exist in the collection, but it may also indicate "
1287 "that the datasets do not exist."
1288 ) from err
1290 @transactional
1291 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1292 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
1294 ``collection`` and ``ref`` combinations that are not currently
1295 associated are silently ignored.
1297 Parameters
1298 ----------
1299 collection : `str`
1300 The collection the datasets should no longer be associated with.
1301 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1302 An iterable of resolved `DatasetRef` instances that already exist
1303 in this `Registry`.
1305 Raises
1306 ------
1307 lsst.daf.butler.AmbiguousDatasetError
1308 Raised if any of the given dataset references is unresolved.
1309 lsst.daf.butler.registry.MissingCollectionError
1310 Raised if ``collection`` does not exist in the registry.
1311 lsst.daf.butler.registry.CollectionTypeError
1312 Raise adding new datasets to the given ``collection`` is not
1313 allowed.
1314 """
1315 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
1316 collectionRecord = self._managers.collections.find(collection)
1317 if collectionRecord.type is not CollectionType.TAGGED:
1318 raise CollectionTypeError(
1319 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
1320 )
1321 for datasetType, refsForType in progress.iter_item_chunks(
1322 DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type"
1323 ):
1324 storage = self._managers.datasets[datasetType.name]
1325 storage.disassociate(collectionRecord, refsForType)
1326 if self._managers.obscore:
1327 self._managers.obscore.disassociate(refsForType, collectionRecord)
1329 @transactional
1330 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
1331 """Associate one or more datasets with a calibration collection and a
1332 validity range within it.
1334 Parameters
1335 ----------
1336 collection : `str`
1337 The name of an already-registered `~CollectionType.CALIBRATION`
1338 collection.
1339 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1340 Datasets to be associated.
1341 timespan : `Timespan`
1342 The validity range for these datasets within the collection.
1344 Raises
1345 ------
1346 lsst.daf.butler.AmbiguousDatasetError
1347 Raised if any of the given `DatasetRef` instances is unresolved.
1348 lsst.daf.butler.registry.ConflictingDefinitionError
1349 Raised if the collection already contains a different dataset with
1350 the same `DatasetType` and data ID and an overlapping validity
1351 range.
1352 lsst.daf.butler.registry.CollectionTypeError
1353 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1354 collection or if one or more datasets are of a dataset type for
1355 which `DatasetType.isCalibration` returns `False`.
1356 """
1357 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
1358 collectionRecord = self._managers.collections.find(collection)
1359 for datasetType, refsForType in progress.iter_item_chunks(
1360 DatasetRef.iter_by_type(refs), desc="Certifying datasets by type"
1361 ):
1362 storage = self._managers.datasets[datasetType.name]
1363 storage.certify(
1364 collectionRecord,
1365 refsForType,
1366 timespan,
1367 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1368 )
1370 @transactional
1371 def decertify(
1372 self,
1373 collection: str,
1374 datasetType: str | DatasetType,
1375 timespan: Timespan,
1376 *,
1377 dataIds: Iterable[DataId] | None = None,
1378 ) -> None:
1379 """Remove or adjust datasets to clear a validity range within a
1380 calibration collection.
1382 Parameters
1383 ----------
1384 collection : `str`
1385 The name of an already-registered `~CollectionType.CALIBRATION`
1386 collection.
1387 datasetType : `str` or `DatasetType`
1388 Name or `DatasetType` instance for the datasets to be decertified.
1389 timespan : `Timespan`, optional
1390 The validity range to remove datasets from within the collection.
1391 Datasets that overlap this range but are not contained by it will
1392 have their validity ranges adjusted to not overlap it, which may
1393 split a single dataset validity range into two.
1394 dataIds : iterable [`dict` or `DataCoordinate`], optional
1395 Data IDs that should be decertified within the given validity range
1396 If `None`, all data IDs for ``self.datasetType`` will be
1397 decertified.
1399 Raises
1400 ------
1401 lsst.daf.butler.registry.CollectionTypeError
1402 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1403 collection or if ``datasetType.isCalibration() is False``.
1404 """
1405 collectionRecord = self._managers.collections.find(collection)
1406 if isinstance(datasetType, str):
1407 storage = self._managers.datasets[datasetType]
1408 else:
1409 storage = self._managers.datasets[datasetType.name]
1410 standardizedDataIds = None
1411 if dataIds is not None:
1412 standardizedDataIds = [
1413 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
1414 ]
1415 storage.decertify(
1416 collectionRecord,
1417 timespan,
1418 dataIds=standardizedDataIds,
1419 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1420 )
1422 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1423 """Return an object that allows a new `Datastore` instance to
1424 communicate with this `Registry`.
1426 Returns
1427 -------
1428 manager : `~.interfaces.DatastoreRegistryBridgeManager`
1429 Object that mediates communication between this `Registry` and its
1430 associated datastores.
1431 """
1432 return self._managers.datastores
1434 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1435 """Retrieve datastore locations for a given dataset.
1437 Parameters
1438 ----------
1439 ref : `DatasetRef`
1440 A reference to the dataset for which to retrieve storage
1441 information.
1443 Returns
1444 -------
1445 datastores : `~collections.abc.Iterable` [ `str` ]
1446 All the matching datastores holding this dataset.
1448 Raises
1449 ------
1450 lsst.daf.butler.AmbiguousDatasetError
1451 Raised if ``ref.id`` is `None`.
1452 """
1453 return self._managers.datastores.findDatastores(ref)
1455 def expandDataId(
1456 self,
1457 dataId: DataId | None = None,
1458 *,
1459 graph: DimensionGraph | None = None,
1460 records: NameLookupMapping[DimensionElement, DimensionRecord | None] | None = None,
1461 withDefaults: bool = True,
1462 **kwargs: Any,
1463 ) -> DataCoordinate:
1464 """Expand a dimension-based data ID to include additional information.
1466 Parameters
1467 ----------
1468 dataId : `DataCoordinate` or `dict`, optional
1469 Data ID to be expanded; augmented and overridden by ``kwargs``.
1470 graph : `DimensionGraph`, optional
1471 Set of dimensions for the expanded ID. If `None`, the dimensions
1472 will be inferred from the keys of ``dataId`` and ``kwargs``.
1473 Dimensions that are in ``dataId`` or ``kwargs`` but not in
1474 ``graph`` are silently ignored, providing a way to extract and
1475 ``graph`` expand a subset of a data ID.
1476 records : `~collections.abc.Mapping` [`str`, `DimensionRecord`], \
1477 optional
1478 Dimension record data to use before querying the database for that
1479 data, keyed by element name.
1480 withDefaults : `bool`, optional
1481 Utilize ``self.defaults.dataId`` to fill in missing governor
1482 dimension key-value pairs. Defaults to `True` (i.e. defaults are
1483 used).
1484 **kwargs
1485 Additional keywords are treated like additional key-value pairs for
1486 ``dataId``, extending and overriding
1488 Returns
1489 -------
1490 expanded : `DataCoordinate`
1491 A data ID that includes full metadata for all of the dimensions it
1492 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1493 ``expanded.hasFull()`` both return `True`.
1495 Raises
1496 ------
1497 lsst.daf.butler.registry.DataIdError
1498 Raised when ``dataId`` or keyword arguments specify unknown
1499 dimensions or values, or when a resulting data ID contains
1500 contradictory key-value pairs, according to dimension
1501 relationships.
1503 Notes
1504 -----
1505 This method cannot be relied upon to reject invalid data ID values
1506 for dimensions that do actually not have any record columns. For
1507 efficiency reasons the records for these dimensions (which have only
1508 dimension key values that are given by the caller) may be constructed
1509 directly rather than obtained from the registry database.
1510 """
1511 if not withDefaults:
1512 defaults = None
1513 else:
1514 defaults = self.defaults.dataId
1515 try:
1516 standardized = DataCoordinate.standardize(
1517 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
1518 )
1519 except KeyError as exc:
1520 # This means either kwargs have some odd name or required
1521 # dimension is missing.
1522 raise DimensionNameError(str(exc)) from exc
1523 if standardized.hasRecords():
1524 return standardized
1525 if records is None:
1526 records = {}
1527 elif isinstance(records, NamedKeyMapping):
1528 records = records.byName()
1529 else:
1530 records = dict(records)
1531 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1532 records.update(dataId.records.byName())
1533 keys = standardized.byName()
1534 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1535 for element in standardized.graph.primaryKeyTraversalOrder:
1536 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1537 if record is ...:
1538 if isinstance(element, Dimension) and keys.get(element.name) is None:
1539 if element in standardized.graph.required:
1540 raise DimensionNameError(
1541 f"No value or null value for required dimension {element.name}."
1542 )
1543 keys[element.name] = None
1544 record = None
1545 else:
1546 storage = self._managers.dimensions[element]
1547 record = storage.fetch_one(DataCoordinate.standardize(keys, graph=element.graph), context)
1548 records[element.name] = record
1549 if record is not None:
1550 for d in element.implied:
1551 value = getattr(record, d.name)
1552 if keys.setdefault(d.name, value) != value:
1553 raise InconsistentDataIdError(
1554 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1555 f"but {element.name} implies {d.name}={value!r}."
1556 )
1557 else:
1558 if element in standardized.graph.required:
1559 raise DataIdValueError(
1560 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1561 )
1562 if element.alwaysJoin:
1563 raise InconsistentDataIdError(
1564 f"Could not fetch record for element {element.name} via keys {keys}, ",
1565 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
1566 "related.",
1567 )
1568 for d in element.implied:
1569 keys.setdefault(d.name, None)
1570 records.setdefault(d.name, None)
1571 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
1573 def insertDimensionData(
1574 self,
1575 element: DimensionElement | str,
1576 *data: Mapping[str, Any] | DimensionRecord,
1577 conform: bool = True,
1578 replace: bool = False,
1579 skip_existing: bool = False,
1580 ) -> None:
1581 """Insert one or more dimension records into the database.
1583 Parameters
1584 ----------
1585 element : `DimensionElement` or `str`
1586 The `DimensionElement` or name thereof that identifies the table
1587 records will be inserted into.
1588 *data : `dict` or `DimensionRecord`
1589 One or more records to insert.
1590 conform : `bool`, optional
1591 If `False` (`True` is default) perform no checking or conversions,
1592 and assume that ``element`` is a `DimensionElement` instance and
1593 ``data`` is a one or more `DimensionRecord` instances of the
1594 appropriate subclass.
1595 replace : `bool`, optional
1596 If `True` (`False` is default), replace existing records in the
1597 database if there is a conflict.
1598 skip_existing : `bool`, optional
1599 If `True` (`False` is default), skip insertion if a record with
1600 the same primary key values already exists. Unlike
1601 `syncDimensionData`, this will not detect when the given record
1602 differs from what is in the database, and should not be used when
1603 this is a concern.
1604 """
1605 if conform:
1606 if isinstance(element, str):
1607 element = self.dimensions[element]
1608 records = [
1609 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
1610 ]
1611 else:
1612 # Ignore typing since caller said to trust them with conform=False.
1613 records = data # type: ignore
1614 storage = self._managers.dimensions[element]
1615 storage.insert(*records, replace=replace, skip_existing=skip_existing)
1617 def syncDimensionData(
1618 self,
1619 element: DimensionElement | str,
1620 row: Mapping[str, Any] | DimensionRecord,
1621 conform: bool = True,
1622 update: bool = False,
1623 ) -> bool | dict[str, Any]:
1624 """Synchronize the given dimension record with the database, inserting
1625 if it does not already exist and comparing values if it does.
1627 Parameters
1628 ----------
1629 element : `DimensionElement` or `str`
1630 The `DimensionElement` or name thereof that identifies the table
1631 records will be inserted into.
1632 row : `dict` or `DimensionRecord`
1633 The record to insert.
1634 conform : `bool`, optional
1635 If `False` (`True` is default) perform no checking or conversions,
1636 and assume that ``element`` is a `DimensionElement` instance and
1637 ``data`` is a one or more `DimensionRecord` instances of the
1638 appropriate subclass.
1639 update : `bool`, optional
1640 If `True` (`False` is default), update the existing record in the
1641 database if there is a conflict.
1643 Returns
1644 -------
1645 inserted_or_updated : `bool` or `dict`
1646 `True` if a new row was inserted, `False` if no changes were
1647 needed, or a `dict` mapping updated column names to their old
1648 values if an update was performed (only possible if
1649 ``update=True``).
1651 Raises
1652 ------
1653 lsst.daf.butler.registry.ConflictingDefinitionError
1654 Raised if the record exists in the database (according to primary
1655 key lookup) but is inconsistent with the given one.
1656 """
1657 if conform:
1658 if isinstance(element, str):
1659 element = self.dimensions[element]
1660 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1661 else:
1662 # Ignore typing since caller said to trust them with conform=False.
1663 record = row # type: ignore
1664 storage = self._managers.dimensions[element]
1665 return storage.sync(record, update=update)
1667 def queryDatasetTypes(
1668 self,
1669 expression: Any = ...,
1670 *,
1671 components: bool | None = False,
1672 missing: list[str] | None = None,
1673 ) -> Iterable[DatasetType]:
1674 """Iterate over the dataset types whose names match an expression.
1676 Parameters
1677 ----------
1678 expression : dataset type expression, optional
1679 An expression that fully or partially identifies the dataset types
1680 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1681 ``...`` can be used to return all dataset types, and is the
1682 default. See :ref:`daf_butler_dataset_type_expressions` for more
1683 information.
1684 components : `bool`, optional
1685 If `True`, apply all expression patterns to component dataset type
1686 names as well. If `False`, never apply patterns to components.
1687 If `None`, apply patterns to components only if their
1688 parent datasets were not matched by the expression.
1689 Fully-specified component datasets (`str` or `DatasetType`
1690 instances) are always included.
1692 Values other than `False` are deprecated, and only `False` will be
1693 supported after v26. After v27 this argument will be removed
1694 entirely.
1695 missing : `list` of `str`, optional
1696 String dataset type names that were explicitly given (i.e. not
1697 regular expression patterns) but not found will be appended to this
1698 list, if it is provided.
1700 Returns
1701 -------
1702 dataset_types : `~collections.abc.Iterable` [ `DatasetType`]
1703 An `~collections.abc.Iterable` of `DatasetType` instances whose
1704 names match ``expression``.
1706 Raises
1707 ------
1708 lsst.daf.butler.registry.DatasetTypeExpressionError
1709 Raised when ``expression`` is invalid.
1710 """
1711 wildcard = DatasetTypeWildcard.from_expression(expression)
1712 composition_dict = self._managers.datasets.resolve_wildcard(
1713 wildcard,
1714 components=components,
1715 missing=missing,
1716 )
1717 result: list[DatasetType] = []
1718 for parent_dataset_type, components_for_parent in composition_dict.items():
1719 result.extend(
1720 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
1721 for c in components_for_parent
1722 )
1723 return result
1725 def queryCollections(
1726 self,
1727 expression: Any = ...,
1728 datasetType: DatasetType | None = None,
1729 collectionTypes: Iterable[CollectionType] | CollectionType = CollectionType.all(),
1730 flattenChains: bool = False,
1731 includeChains: bool | None = None,
1732 ) -> Sequence[str]:
1733 """Iterate over the collections whose names match an expression.
1735 Parameters
1736 ----------
1737 expression : collection expression, optional
1738 An expression that identifies the collections to return, such as
1739 a `str` (for full matches or partial matches via globs),
1740 `re.Pattern` (for partial matches), or iterable thereof. ``...``
1741 can be used to return all collections, and is the default.
1742 See :ref:`daf_butler_collection_expressions` for more information.
1743 datasetType : `DatasetType`, optional
1744 If provided, only yield collections that may contain datasets of
1745 this type. This is a conservative approximation in general; it may
1746 yield collections that do not have any such datasets.
1747 collectionTypes : `~collections.abc.Set` [`CollectionType`] or \
1748 `CollectionType`, optional
1749 If provided, only yield collections of these types.
1750 flattenChains : `bool`, optional
1751 If `True` (`False` is default), recursively yield the child
1752 collections of matching `~CollectionType.CHAINED` collections.
1753 includeChains : `bool`, optional
1754 If `True`, yield records for matching `~CollectionType.CHAINED`
1755 collections. Default is the opposite of ``flattenChains``: include
1756 either CHAINED collections or their children, but not both.
1758 Returns
1759 -------
1760 collections : `~collections.abc.Sequence` [ `str` ]
1761 The names of collections that match ``expression``.
1763 Raises
1764 ------
1765 lsst.daf.butler.registry.CollectionExpressionError
1766 Raised when ``expression`` is invalid.
1768 Notes
1769 -----
1770 The order in which collections are returned is unspecified, except that
1771 the children of a `~CollectionType.CHAINED` collection are guaranteed
1772 to be in the order in which they are searched. When multiple parent
1773 `~CollectionType.CHAINED` collections match the same criteria, the
1774 order in which the two lists appear is unspecified, and the lists of
1775 children may be incomplete if a child has multiple parents.
1776 """
1777 # Right now the datasetTypes argument is completely ignored, but that
1778 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1779 # ticket will take care of that.
1780 try:
1781 wildcard = CollectionWildcard.from_expression(expression)
1782 except TypeError as exc:
1783 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
1784 collectionTypes = ensure_iterable(collectionTypes)
1785 return [
1786 record.name
1787 for record in self._managers.collections.resolve_wildcard(
1788 wildcard,
1789 collection_types=frozenset(collectionTypes),
1790 flatten_chains=flattenChains,
1791 include_chains=includeChains,
1792 )
1793 ]
1795 def _makeQueryBuilder(
1796 self,
1797 summary: queries.QuerySummary,
1798 doomed_by: Iterable[str] = (),
1799 ) -> queries.QueryBuilder:
1800 """Return a `QueryBuilder` instance capable of constructing and
1801 managing more complex queries than those obtainable via `Registry`
1802 interfaces.
1804 This is an advanced interface; downstream code should prefer
1805 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1806 are sufficient.
1808 Parameters
1809 ----------
1810 summary : `queries.QuerySummary`
1811 Object describing and categorizing the full set of dimensions that
1812 will be included in the query.
1813 doomed_by : `~collections.abc.Iterable` of `str`, optional
1814 A list of diagnostic messages that indicate why the query is going
1815 to yield no results and should not even be executed. If an empty
1816 container (default) the query will be executed unless other code
1817 determines that it is doomed.
1819 Returns
1820 -------
1821 builder : `queries.QueryBuilder`
1822 Object that can be used to construct and perform advanced queries.
1823 """
1824 doomed_by = list(doomed_by)
1825 backend = queries.SqlQueryBackend(self._db, self._managers)
1826 context = backend.context()
1827 relation: Relation | None = None
1828 if doomed_by:
1829 relation = LeafRelation.make_doomed(context.sql_engine, set(), doomed_by)
1830 return queries.QueryBuilder(
1831 summary,
1832 backend=backend,
1833 context=context,
1834 relation=relation,
1835 )
1837 def _standardize_query_data_id_args(
1838 self, data_id: DataId | None, *, doomed_by: list[str], **kwargs: Any
1839 ) -> DataCoordinate:
1840 """Preprocess the data ID arguments passed to query* methods.
1842 Parameters
1843 ----------
1844 data_id : `DataId` or `None`
1845 Data ID that constrains the query results.
1846 doomed_by : `list` [ `str` ]
1847 List to append messages indicating why the query is doomed to
1848 yield no results.
1849 **kwargs
1850 Additional data ID key-value pairs, extending and overriding
1851 ``data_id``.
1853 Returns
1854 -------
1855 data_id : `DataCoordinate`
1856 Standardized data ID. Will be fully expanded unless expansion
1857 fails, in which case a message will be appended to ``doomed_by``
1858 on return.
1859 """
1860 try:
1861 return self.expandDataId(data_id, **kwargs)
1862 except DataIdValueError as err:
1863 doomed_by.append(str(err))
1864 return DataCoordinate.standardize(
1865 data_id, **kwargs, universe=self.dimensions, defaults=self.defaults.dataId
1866 )
1868 def _standardize_query_dataset_args(
1869 self,
1870 datasets: Any,
1871 collections: CollectionArgType | None,
1872 components: bool | None,
1873 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
1874 *,
1875 doomed_by: list[str],
1876 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
1877 """Preprocess dataset arguments passed to query* methods.
1879 Parameters
1880 ----------
1881 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
1882 Expression identifying dataset types. See `queryDatasetTypes` for
1883 details.
1884 collections : `str`, `re.Pattern`, or iterable of these
1885 Expression identifying collections to be searched. See
1886 `queryCollections` for details.
1887 components : `bool`, optional
1888 If `True`, apply all expression patterns to component dataset type
1889 names as well. If `False`, never apply patterns to components.
1890 If `None` (default), apply patterns to components only if their
1891 parent datasets were not matched by the expression.
1892 Fully-specified component datasets (`str` or `DatasetType`
1893 instances) are always included.
1895 Values other than `False` are deprecated, and only `False` will be
1896 supported after v26. After v27 this argument will be removed
1897 entirely.
1898 mode : `str`, optional
1899 The way in which datasets are being used in this query; one of:
1901 - "find_first": this is a query for the first dataset in an
1902 ordered list of collections. Prohibits collection wildcards,
1903 but permits dataset type wildcards.
1905 - "find_all": this is a query for all datasets in all matched
1906 collections. Permits collection and dataset type wildcards.
1908 - "constrain": this is a query for something other than datasets,
1909 with results constrained by dataset existence. Permits
1910 collection wildcards and prohibits ``...`` as a dataset type
1911 wildcard.
1912 doomed_by : `list` [ `str` ]
1913 List to append messages indicating why the query is doomed to
1914 yield no results.
1916 Returns
1917 -------
1918 composition : `defaultdict` [ `DatasetType`, `list` [ `str` ] ]
1919 Dictionary mapping parent dataset type to `list` of components
1920 matched for that dataset type (or `None` for the parent itself).
1921 collections : `CollectionWildcard`
1922 Processed collection expression.
1923 """
1924 composition: dict[DatasetType, list[str | None]] = {}
1925 collection_wildcard: CollectionWildcard | None = None
1926 if datasets is not None:
1927 if collections is None:
1928 if not self.defaults.collections:
1929 raise NoDefaultCollectionError("No collections, and no registry default collections.")
1930 collection_wildcard = CollectionWildcard.from_expression(self.defaults.collections)
1931 else:
1932 collection_wildcard = CollectionWildcard.from_expression(collections)
1933 if mode == "find_first" and collection_wildcard.patterns:
1934 raise TypeError(
1935 f"Collection pattern(s) {collection_wildcard.patterns} not allowed in this context."
1936 )
1937 missing: list[str] = []
1938 composition = self._managers.datasets.resolve_wildcard(
1939 datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
1940 )
1941 if missing and mode == "constrain":
1942 # After v26 this should raise MissingDatasetTypeError, to be
1943 # implemented on DM-36303.
1944 warnings.warn(
1945 f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
1946 FutureWarning,
1947 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
1948 )
1949 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
1950 elif collections:
1951 # I think this check should actually be `collections is not None`,
1952 # but it looks like some CLI scripts use empty tuple as default.
1953 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1954 return composition, collection_wildcard
1956 def queryDatasets(
1957 self,
1958 datasetType: Any,
1959 *,
1960 collections: CollectionArgType | None = None,
1961 dimensions: Iterable[Dimension | str] | None = None,
1962 dataId: DataId | None = None,
1963 where: str = "",
1964 findFirst: bool = False,
1965 components: bool | None = False,
1966 bind: Mapping[str, Any] | None = None,
1967 check: bool = True,
1968 **kwargs: Any,
1969 ) -> queries.DatasetQueryResults:
1970 """Query for and iterate over dataset references matching user-provided
1971 criteria.
1973 Parameters
1974 ----------
1975 datasetType : dataset type expression
1976 An expression that fully or partially identifies the dataset types
1977 to be queried. Allowed types include `DatasetType`, `str`,
1978 `re.Pattern`, and iterables thereof. The special value ``...`` can
1979 be used to query all dataset types. See
1980 :ref:`daf_butler_dataset_type_expressions` for more information.
1981 collections : collection expression, optional
1982 An expression that identifies the collections to search, such as a
1983 `str` (for full matches or partial matches via globs), `re.Pattern`
1984 (for partial matches), or iterable thereof. ``...`` can be used to
1985 search all collections (actually just all `~CollectionType.RUN`
1986 collections, because this will still find all datasets).
1987 If not provided, ``self.default.collections`` is used. See
1988 :ref:`daf_butler_collection_expressions` for more information.
1989 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1990 Dimensions to include in the query (in addition to those used
1991 to identify the queried dataset type(s)), either to constrain
1992 the resulting datasets to those for which a matching dimension
1993 exists, or to relate the dataset type's dimensions to dimensions
1994 referenced by the ``dataId`` or ``where`` arguments.
1995 dataId : `dict` or `DataCoordinate`, optional
1996 A data ID whose key-value pairs are used as equality constraints
1997 in the query.
1998 where : `str`, optional
1999 A string expression similar to a SQL WHERE clause. May involve
2000 any column of a dimension table or (as a shortcut for the primary
2001 key column of a dimension table) dimension name. See
2002 :ref:`daf_butler_dimension_expressions` for more information.
2003 findFirst : `bool`, optional
2004 If `True` (`False` is default), for each result data ID, only
2005 yield one `DatasetRef` of each `DatasetType`, from the first
2006 collection in which a dataset of that dataset type appears
2007 (according to the order of ``collections`` passed in). If `True`,
2008 ``collections`` must not contain regular expressions and may not
2009 be ``...``.
2010 components : `bool`, optional
2011 If `True`, apply all dataset expression patterns to component
2012 dataset type names as well. If `False`, never apply patterns to
2013 components. If `None`, apply patterns to components only
2014 if their parent datasets were not matched by the expression.
2015 Fully-specified component datasets (`str` or `DatasetType`
2016 instances) are always included.
2018 Values other than `False` are deprecated, and only `False` will be
2019 supported after v26. After v27 this argument will be removed
2020 entirely.
2021 bind : `~collections.abc.Mapping`, optional
2022 Mapping containing literal values that should be injected into the
2023 ``where`` expression, keyed by the identifiers they replace.
2024 Values of collection type can be expanded in some cases; see
2025 :ref:`daf_butler_dimension_expressions_identifiers` for more
2026 information.
2027 check : `bool`, optional
2028 If `True` (default) check the query for consistency before
2029 executing it. This may reject some valid queries that resemble
2030 common mistakes (e.g. queries for visits without specifying an
2031 instrument).
2032 **kwargs
2033 Additional keyword arguments are forwarded to
2034 `DataCoordinate.standardize` when processing the ``dataId``
2035 argument (and may be used to provide a constraining data ID even
2036 when the ``dataId`` argument is `None`).
2038 Returns
2039 -------
2040 refs : `.queries.DatasetQueryResults`
2041 Dataset references matching the given query criteria. Nested data
2042 IDs are guaranteed to include values for all implied dimensions
2043 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
2044 include dimension records (`DataCoordinate.hasRecords` will be
2045 `False`) unless `~.queries.DatasetQueryResults.expanded` is
2046 called on the result object (which returns a new one).
2048 Raises
2049 ------
2050 lsst.daf.butler.registry.DatasetTypeExpressionError
2051 Raised when ``datasetType`` expression is invalid.
2052 TypeError
2053 Raised when the arguments are incompatible, such as when a
2054 collection wildcard is passed when ``findFirst`` is `True`, or
2055 when ``collections`` is `None` and ``self.defaults.collections`` is
2056 also `None`.
2057 lsst.daf.butler.registry.DataIdError
2058 Raised when ``dataId`` or keyword arguments specify unknown
2059 dimensions or values, or when they contain inconsistent values.
2060 lsst.daf.butler.registry.UserExpressionError
2061 Raised when ``where`` expression is invalid.
2063 Notes
2064 -----
2065 When multiple dataset types are queried in a single call, the
2066 results of this operation are equivalent to querying for each dataset
2067 type separately in turn, and no information about the relationships
2068 between datasets of different types is included. In contexts where
2069 that kind of information is important, the recommended pattern is to
2070 use `queryDataIds` to first obtain data IDs (possibly with the
2071 desired dataset types and collections passed as constraints to the
2072 query), and then use multiple (generally much simpler) calls to
2073 `queryDatasets` with the returned data IDs passed as constraints.
2074 """
2075 doomed_by: list[str] = []
2076 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2077 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2078 datasetType,
2079 collections,
2080 components,
2081 mode="find_first" if findFirst else "find_all",
2082 doomed_by=doomed_by,
2083 )
2084 if collection_wildcard is not None and collection_wildcard.empty():
2085 doomed_by.append("No datasets can be found because collection list is empty.")
2086 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2087 parent_results: list[queries.ParentDatasetQueryResults] = []
2088 for parent_dataset_type, components_for_parent in dataset_composition.items():
2089 # The full set of dimensions in the query is the combination of
2090 # those needed for the DatasetType and those explicitly requested,
2091 # if any.
2092 dimension_names = set(parent_dataset_type.dimensions.names)
2093 if dimensions is not None:
2094 dimension_names.update(self.dimensions.extract(dimensions).names)
2095 # Construct the summary structure needed to construct a
2096 # QueryBuilder.
2097 summary = queries.QuerySummary(
2098 requested=DimensionGraph(self.dimensions, names=dimension_names),
2099 column_types=self._managers.column_types,
2100 data_id=data_id,
2101 expression=where,
2102 bind=bind,
2103 defaults=self.defaults.dataId,
2104 check=check,
2105 datasets=[parent_dataset_type],
2106 )
2107 builder = self._makeQueryBuilder(summary)
2108 # Add the dataset subquery to the query, telling the QueryBuilder
2109 # to include the rank of the selected collection in the results
2110 # only if we need to findFirst. Note that if any of the
2111 # collections are actually wildcard expressions, and
2112 # findFirst=True, this will raise TypeError for us.
2113 builder.joinDataset(parent_dataset_type, collection_wildcard, isResult=True, findFirst=findFirst)
2114 query = builder.finish()
2115 parent_results.append(
2116 queries.ParentDatasetQueryResults(
2117 query, parent_dataset_type, components=components_for_parent
2118 )
2119 )
2120 if not parent_results:
2121 doomed_by.extend(
2122 f"No registered dataset type matching {t!r} found, so no matching datasets can "
2123 "exist in any collection."
2124 for t in ensure_iterable(datasetType)
2125 )
2126 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2127 elif len(parent_results) == 1:
2128 return parent_results[0]
2129 else:
2130 return queries.ChainedDatasetQueryResults(parent_results)
2132 def queryDataIds(
2133 self,
2134 dimensions: Iterable[Dimension | str] | Dimension | str,
2135 *,
2136 dataId: DataId | None = None,
2137 datasets: Any = None,
2138 collections: CollectionArgType | None = None,
2139 where: str = "",
2140 components: bool | None = None,
2141 bind: Mapping[str, Any] | None = None,
2142 check: bool = True,
2143 **kwargs: Any,
2144 ) -> queries.DataCoordinateQueryResults:
2145 """Query for data IDs matching user-provided criteria.
2147 Parameters
2148 ----------
2149 dimensions : `Dimension` or `str`, or iterable thereof
2150 The dimensions of the data IDs to yield, as either `Dimension`
2151 instances or `str`. Will be automatically expanded to a complete
2152 `DimensionGraph`.
2153 dataId : `dict` or `DataCoordinate`, optional
2154 A data ID whose key-value pairs are used as equality constraints
2155 in the query.
2156 datasets : dataset type expression, optional
2157 An expression that fully or partially identifies dataset types
2158 that should constrain the yielded data IDs. For example, including
2159 "raw" here would constrain the yielded ``instrument``,
2160 ``exposure``, ``detector``, and ``physical_filter`` values to only
2161 those for which at least one "raw" dataset exists in
2162 ``collections``. Allowed types include `DatasetType`, `str`,
2163 and iterables thereof. Regular expression objects (i.e.
2164 `re.Pattern`) are deprecated and will be removed after the v26
2165 release. See :ref:`daf_butler_dataset_type_expressions` for more
2166 information.
2167 collections : collection expression, optional
2168 An expression that identifies the collections to search for
2169 datasets, such as a `str` (for full matches or partial matches
2170 via globs), `re.Pattern` (for partial matches), or iterable
2171 thereof. ``...`` can be used to search all collections (actually
2172 just all `~CollectionType.RUN` collections, because this will
2173 still find all datasets). If not provided,
2174 ``self.default.collections`` is used. Ignored unless ``datasets``
2175 is also passed. See :ref:`daf_butler_collection_expressions` for
2176 more information.
2177 where : `str`, optional
2178 A string expression similar to a SQL WHERE clause. May involve
2179 any column of a dimension table or (as a shortcut for the primary
2180 key column of a dimension table) dimension name. See
2181 :ref:`daf_butler_dimension_expressions` for more information.
2182 components : `bool`, optional
2183 If `True`, apply all dataset expression patterns to component
2184 dataset type names as well. If `False`, never apply patterns to
2185 components. If `None`, apply patterns to components only
2186 if their parent datasets were not matched by the expression.
2187 Fully-specified component datasets (`str` or `DatasetType`
2188 instances) are always included.
2190 Values other than `False` are deprecated, and only `False` will be
2191 supported after v26. After v27 this argument will be removed
2192 entirely.
2193 bind : `~collections.abc.Mapping`, optional
2194 Mapping containing literal values that should be injected into the
2195 ``where`` expression, keyed by the identifiers they replace.
2196 Values of collection type can be expanded in some cases; see
2197 :ref:`daf_butler_dimension_expressions_identifiers` for more
2198 information.
2199 check : `bool`, optional
2200 If `True` (default) check the query for consistency before
2201 executing it. This may reject some valid queries that resemble
2202 common mistakes (e.g. queries for visits without specifying an
2203 instrument).
2204 **kwargs
2205 Additional keyword arguments are forwarded to
2206 `DataCoordinate.standardize` when processing the ``dataId``
2207 argument (and may be used to provide a constraining data ID even
2208 when the ``dataId`` argument is `None`).
2210 Returns
2211 -------
2212 dataIds : `.queries.DataCoordinateQueryResults`
2213 Data IDs matching the given query parameters. These are guaranteed
2214 to identify all dimensions (`DataCoordinate.hasFull` returns
2215 `True`), but will not contain `DimensionRecord` objects
2216 (`DataCoordinate.hasRecords` returns `False`). Call
2217 `~.queries.DataCoordinateQueryResults.expanded` on the
2218 returned object to fetch those (and consider using
2219 `~.queries.DataCoordinateQueryResults.materialize` on the
2220 returned object first if the expected number of rows is very
2221 large). See documentation for those methods for additional
2222 information.
2224 Raises
2225 ------
2226 lsst.daf.butler.registry.NoDefaultCollectionError
2227 Raised if ``collections`` is `None` and
2228 ``self.defaults.collections`` is `None`.
2229 lsst.daf.butler.registry.CollectionExpressionError
2230 Raised when ``collections`` expression is invalid.
2231 lsst.daf.butler.registry.DataIdError
2232 Raised when ``dataId`` or keyword arguments specify unknown
2233 dimensions or values, or when they contain inconsistent values.
2234 lsst.daf.butler.registry.DatasetTypeExpressionError
2235 Raised when ``datasetType`` expression is invalid.
2236 lsst.daf.butler.registry.UserExpressionError
2237 Raised when ``where`` expression is invalid.
2238 """
2239 dimensions = ensure_iterable(dimensions)
2240 requestedDimensions = self.dimensions.extract(dimensions)
2241 doomed_by: list[str] = []
2242 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2243 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2244 datasets, collections, components, doomed_by=doomed_by
2245 )
2246 if collection_wildcard is not None and collection_wildcard.empty():
2247 doomed_by.append("No data coordinates can be found because collection list is empty.")
2248 summary = queries.QuerySummary(
2249 requested=requestedDimensions,
2250 column_types=self._managers.column_types,
2251 data_id=data_id,
2252 expression=where,
2253 bind=bind,
2254 defaults=self.defaults.dataId,
2255 check=check,
2256 datasets=dataset_composition.keys(),
2257 )
2258 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2259 for datasetType in dataset_composition:
2260 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2261 query = builder.finish()
2263 return queries.DataCoordinateQueryResults(query)
2265 def queryDimensionRecords(
2266 self,
2267 element: DimensionElement | str,
2268 *,
2269 dataId: DataId | None = None,
2270 datasets: Any = None,
2271 collections: CollectionArgType | None = None,
2272 where: str = "",
2273 components: bool | None = None,
2274 bind: Mapping[str, Any] | None = None,
2275 check: bool = True,
2276 **kwargs: Any,
2277 ) -> queries.DimensionRecordQueryResults:
2278 """Query for dimension information matching user-provided criteria.
2280 Parameters
2281 ----------
2282 element : `DimensionElement` or `str`
2283 The dimension element to obtain records for.
2284 dataId : `dict` or `DataCoordinate`, optional
2285 A data ID whose key-value pairs are used as equality constraints
2286 in the query.
2287 datasets : dataset type expression, optional
2288 An expression that fully or partially identifies dataset types
2289 that should constrain the yielded records. See `queryDataIds` and
2290 :ref:`daf_butler_dataset_type_expressions` for more information.
2291 collections : collection expression, optional
2292 An expression that identifies the collections to search for
2293 datasets, such as a `str` (for full matches or partial matches
2294 via globs), `re.Pattern` (for partial matches), or iterable
2295 thereof. ``...`` can be used to search all collections (actually
2296 just all `~CollectionType.RUN` collections, because this will
2297 still find all datasets). If not provided,
2298 ``self.default.collections`` is used. Ignored unless ``datasets``
2299 is also passed. See :ref:`daf_butler_collection_expressions` for
2300 more information.
2301 where : `str`, optional
2302 A string expression similar to a SQL WHERE clause. See
2303 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
2304 information.
2305 components : `bool`, optional
2306 Whether to apply dataset expressions to components as well.
2307 See `queryDataIds` for more information.
2309 Values other than `False` are deprecated, and only `False` will be
2310 supported after v26. After v27 this argument will be removed
2311 entirely.
2312 bind : `~collections.abc.Mapping`, optional
2313 Mapping containing literal values that should be injected into the
2314 ``where`` expression, keyed by the identifiers they replace.
2315 Values of collection type can be expanded in some cases; see
2316 :ref:`daf_butler_dimension_expressions_identifiers` for more
2317 information.
2318 check : `bool`, optional
2319 If `True` (default) check the query for consistency before
2320 executing it. This may reject some valid queries that resemble
2321 common mistakes (e.g. queries for visits without specifying an
2322 instrument).
2323 **kwargs
2324 Additional keyword arguments are forwarded to
2325 `DataCoordinate.standardize` when processing the ``dataId``
2326 argument (and may be used to provide a constraining data ID even
2327 when the ``dataId`` argument is `None`).
2329 Returns
2330 -------
2331 dataIds : `.queries.DimensionRecordQueryResults`
2332 Data IDs matching the given query parameters.
2334 Raises
2335 ------
2336 lsst.daf.butler.registry.NoDefaultCollectionError
2337 Raised if ``collections`` is `None` and
2338 ``self.defaults.collections`` is `None`.
2339 lsst.daf.butler.registry.CollectionExpressionError
2340 Raised when ``collections`` expression is invalid.
2341 lsst.daf.butler.registry.DataIdError
2342 Raised when ``dataId`` or keyword arguments specify unknown
2343 dimensions or values, or when they contain inconsistent values.
2344 lsst.daf.butler.registry.DatasetTypeExpressionError
2345 Raised when ``datasetType`` expression is invalid.
2346 lsst.daf.butler.registry.UserExpressionError
2347 Raised when ``where`` expression is invalid.
2348 """
2349 if not isinstance(element, DimensionElement):
2350 try:
2351 element = self.dimensions[element]
2352 except KeyError as e:
2353 raise DimensionNameError(
2354 f"No such dimension '{element}', available dimensions: "
2355 + str(self.dimensions.getStaticElements())
2356 ) from e
2357 doomed_by: list[str] = []
2358 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2359 dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
2360 datasets, collections, components, doomed_by=doomed_by
2361 )
2362 if collection_wildcard is not None and collection_wildcard.empty():
2363 doomed_by.append("No dimension records can be found because collection list is empty.")
2364 summary = queries.QuerySummary(
2365 requested=element.graph,
2366 column_types=self._managers.column_types,
2367 data_id=data_id,
2368 expression=where,
2369 bind=bind,
2370 defaults=self.defaults.dataId,
2371 check=check,
2372 datasets=dataset_composition.keys(),
2373 )
2374 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2375 for datasetType in dataset_composition:
2376 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2377 query = builder.finish().with_record_columns(element)
2378 return queries.DatabaseDimensionRecordQueryResults(query, element)
2380 def queryDatasetAssociations(
2381 self,
2382 datasetType: str | DatasetType,
2383 collections: CollectionArgType | None = ...,
2384 *,
2385 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
2386 flattenChains: bool = False,
2387 ) -> Iterator[DatasetAssociation]:
2388 """Iterate over dataset-collection combinations where the dataset is in
2389 the collection.
2391 This method is a temporary placeholder for better support for
2392 association results in `queryDatasets`. It will probably be
2393 removed in the future, and should be avoided in production code
2394 whenever possible.
2396 Parameters
2397 ----------
2398 datasetType : `DatasetType` or `str`
2399 A dataset type object or the name of one.
2400 collections : collection expression, optional
2401 An expression that identifies the collections to search for
2402 datasets, such as a `str` (for full matches or partial matches
2403 via globs), `re.Pattern` (for partial matches), or iterable
2404 thereof. ``...`` can be used to search all collections (actually
2405 just all `~CollectionType.RUN` collections, because this will still
2406 find all datasets). If not provided, ``self.default.collections``
2407 is used. See :ref:`daf_butler_collection_expressions` for more
2408 information.
2409 collectionTypes : `~collections.abc.Set` [ `CollectionType` ], optional
2410 If provided, only yield associations from collections of these
2411 types.
2412 flattenChains : `bool`, optional
2413 If `True`, search in the children of `~CollectionType.CHAINED`
2414 collections. If `False`, ``CHAINED`` collections are ignored.
2416 Yields
2417 ------
2418 association : `.DatasetAssociation`
2419 Object representing the relationship between a single dataset and
2420 a single collection.
2422 Raises
2423 ------
2424 lsst.daf.butler.registry.NoDefaultCollectionError
2425 Raised if ``collections`` is `None` and
2426 ``self.defaults.collections`` is `None`.
2427 lsst.daf.butler.registry.CollectionExpressionError
2428 Raised when ``collections`` expression is invalid.
2429 """
2430 if collections is None:
2431 if not self.defaults.collections:
2432 raise NoDefaultCollectionError(
2433 "No collections provided to queryDatasetAssociations, "
2434 "and no defaults from registry construction."
2435 )
2436 collections = self.defaults.collections
2437 collection_wildcard = CollectionWildcard.from_expression(collections)
2438 backend = queries.SqlQueryBackend(self._db, self._managers)
2439 parent_dataset_type, _ = backend.resolve_single_dataset_type_wildcard(datasetType, components=False)
2440 timespan_tag = DatasetColumnTag(parent_dataset_type.name, "timespan")
2441 collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
2442 for parent_collection_record in backend.resolve_collection_wildcard(
2443 collection_wildcard,
2444 collection_types=frozenset(collectionTypes),
2445 flatten_chains=flattenChains,
2446 ):
2447 # Resolve this possibly-chained collection into a list of
2448 # non-CHAINED collections that actually hold datasets of this
2449 # type.
2450 candidate_collection_records = backend.resolve_dataset_collections(
2451 parent_dataset_type,
2452 CollectionWildcard.from_names([parent_collection_record.name]),
2453 allow_calibration_collections=True,
2454 governor_constraints={},
2455 )
2456 if not candidate_collection_records:
2457 continue
2458 with backend.context() as context:
2459 relation = backend.make_dataset_query_relation(
2460 parent_dataset_type,
2461 candidate_collection_records,
2462 columns={"dataset_id", "run", "timespan", "collection"},
2463 context=context,
2464 )
2465 reader = queries.DatasetRefReader(
2466 parent_dataset_type,
2467 translate_collection=lambda k: self._managers.collections[k].name,
2468 full=False,
2469 )
2470 for row in context.fetch_iterable(relation):
2471 ref = reader.read(row)
2472 collection_record = self._managers.collections[row[collection_tag]]
2473 if collection_record.type is CollectionType.CALIBRATION:
2474 timespan = row[timespan_tag]
2475 else:
2476 # For backwards compatibility and (possibly?) user
2477 # convenience we continue to define the timespan of a
2478 # DatasetAssociation row for a non-CALIBRATION
2479 # collection to be None rather than a fully unbounded
2480 # timespan.
2481 timespan = None
2482 yield DatasetAssociation(ref=ref, collection=collection_record.name, timespan=timespan)
2484 def get_datastore_records(self, ref: DatasetRef) -> DatasetRef:
2485 """Retrieve datastore records for given ref.
2487 Parameters
2488 ----------
2489 ref : `DatasetRef`
2490 Dataset reference for which to retrieve its corresponding datastore
2491 records.
2493 Returns
2494 -------
2495 updated_ref : `DatasetRef`
2496 Dataset reference with filled datastore records.
2498 Notes
2499 -----
2500 If this method is called with the dataset ref that is not known to the
2501 registry then the reference with an empty set of records is returned.
2502 """
2503 datastore_records: dict[str, list[StoredDatastoreItemInfo]] = {}
2504 for opaque, record_class in self._datastore_record_classes.items():
2505 records = self.fetchOpaqueData(opaque, dataset_id=ref.id)
2506 datastore_records[opaque] = [record_class.from_record(record) for record in records]
2507 return ref.replace(datastore_records=datastore_records)
2509 def store_datastore_records(self, refs: Mapping[str, DatasetRef]) -> None:
2510 """Store datastore records for given refs.
2512 Parameters
2513 ----------
2514 refs : `~collections.abc.Mapping` [`str`, `DatasetRef`]
2515 Mapping of a datastore name to dataset reference stored in that
2516 datastore, reference must include datastore records.
2517 """
2518 for datastore_name, ref in refs.items():
2519 # Store ref IDs in the bridge table.
2520 bridge = self._managers.datastores.register(datastore_name)
2521 bridge.insert([ref])
2523 # store records in opaque tables
2524 assert ref._datastore_records is not None, "Dataset ref must have datastore records"
2525 for table_name, records in ref._datastore_records.items():
2526 opaque_table = self._managers.opaque.get(table_name)
2527 assert opaque_table is not None, f"Unexpected opaque table name {table_name}"
2528 opaque_table.insert(*(record.to_record(dataset_id=ref.id) for record in records))
2530 def make_datastore_tables(self, tables: Mapping[str, DatastoreOpaqueTable]) -> None:
2531 """Create opaque tables used by datastores.
2533 Parameters
2534 ----------
2535 tables : `~collections.abc.Mapping`
2536 Maps opaque table name to its definition.
2538 Notes
2539 -----
2540 This method should disappear in the future when opaque table
2541 definitions will be provided during `Registry` construction.
2542 """
2543 datastore_record_classes = {}
2544 for table_name, table_def in tables.items():
2545 datastore_record_classes[table_name] = table_def.record_class
2546 try:
2547 self._managers.opaque.register(table_name, table_def.table_spec)
2548 except ReadOnlyDatabaseError:
2549 # If the database is read only and we just tried and failed to
2550 # create a table, it means someone is trying to create a
2551 # read-only butler client for an empty repo. That should be
2552 # okay, as long as they then try to get any datasets before
2553 # some other client creates the table. Chances are they're
2554 # just validating configuration.
2555 pass
2556 self._datastore_record_classes = datastore_record_classes
2558 @property
2559 def obsCoreTableManager(self) -> ObsCoreTableManager | None:
2560 """The ObsCore manager instance for this registry
2561 (`~.interfaces.ObsCoreTableManager`
2562 or `None`).
2564 ObsCore manager may not be implemented for all registry backend, or
2565 may not be enabled for many repositories.
2566 """
2567 return self._managers.obscore
2569 storageClasses: StorageClassFactory
2570 """All storage classes known to the registry (`StorageClassFactory`).
2571 """
2573 _defaults: RegistryDefaults
2574 """Default collections used for registry queries (`RegistryDefaults`)."""