Coverage for python/lsst/daf/butler/registry/sql_registry.py: 17%
577 statements
coverage.py v7.4.0, created at 2024-01-16 10:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from .. import ddl
32__all__ = ("SqlRegistry",)
34import contextlib
35import logging
36import warnings
37from collections.abc import Iterable, Iterator, Mapping, Sequence
38from typing import TYPE_CHECKING, Any, Literal, cast
40import sqlalchemy
41from lsst.daf.relation import LeafRelation, Relation
42from lsst.resources import ResourcePathExpression
43from lsst.utils.introspection import find_outside_stacklevel
44from lsst.utils.iteration import ensure_iterable
46from .._column_tags import DatasetColumnTag
47from .._config import Config
48from .._dataset_association import DatasetAssociation
49from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
50from .._dataset_type import DatasetType
51from .._named import NamedKeyMapping, NameLookupMapping
52from .._storage_class import StorageClassFactory
53from .._timespan import Timespan
54from ..dimensions import (
55 DataCoordinate,
56 DataId,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionGroup,
62 DimensionRecord,
63 DimensionUniverse,
64)
65from ..dimensions.record_cache import DimensionRecordCache
66from ..progress import Progress
67from ..registry import (
68 ArgumentError,
69 CollectionExpressionError,
70 CollectionSummary,
71 CollectionType,
72 CollectionTypeError,
73 ConflictingDefinitionError,
74 DataIdValueError,
75 DatasetTypeError,
76 DimensionNameError,
77 InconsistentDataIdError,
78 MissingDatasetTypeError,
79 NoDefaultCollectionError,
80 OrphanedRecordError,
81 RegistryConfig,
82 RegistryConsistencyError,
83 RegistryDefaults,
84 queries,
85)
86from ..registry.interfaces import ChainedCollectionRecord, ReadOnlyDatabaseError, RunRecord
87from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
88from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
89from ..utils import _DefaultMarker, _Marker, transactional
91if TYPE_CHECKING:
92 from .._butler_config import ButlerConfig
93 from ..datastore._datastore import DatastoreOpaqueTable
94 from ..datastore.stored_file_info import StoredDatastoreItemInfo
95 from ..registry._registry import CollectionArgType
96 from ..registry.interfaces import (
97 CollectionRecord,
98 Database,
99 DatastoreRegistryBridgeManager,
100 ObsCoreTableManager,
101 )
104_LOG = logging.getLogger(__name__)
107class SqlRegistry:
108 """Butler Registry implementation that uses SQL database as backend.
110 Parameters
111 ----------
112 database : `Database`
113 Database instance to store Registry.
114 defaults : `RegistryDefaults`
115 Default collection search path and/or output `~CollectionType.RUN`
116 collection.
117 managers : `RegistryManagerInstances`
118 All the managers required for this registry.
119 """
121 defaultConfigFile: str | None = None
122 """Path to configuration defaults. Accessed within the ``configs`` resource
123 or relative to a search path. Can be None if no defaults specified.
124 """
126 @classmethod
127 def forceRegistryConfig(
128 cls, config: ButlerConfig | RegistryConfig | Config | str | None
129 ) -> RegistryConfig:
130 """Force the supplied config to a `RegistryConfig`.
132 Parameters
133 ----------
134 config : `RegistryConfig`, `Config` or `str` or `None`
135 Registry configuration. If missing, the default configuration will
136 be loaded from ``registry.yaml``.
138 Returns
139 -------
140 registry_config : `RegistryConfig`
141 A registry config.
142 """
143 if not isinstance(config, RegistryConfig):
144 if isinstance(config, str | Config) or config is None:
145 config = RegistryConfig(config)
146 else:
147 raise ValueError(f"Incompatible Registry configuration: {config}")
148 return config
150 @classmethod
151 def createFromConfig(
152 cls,
153 config: RegistryConfig | str | None = None,
154 dimensionConfig: DimensionConfig | str | None = None,
155 butlerRoot: ResourcePathExpression | None = None,
156 ) -> SqlRegistry:
157 """Create registry database and return `SqlRegistry` instance.
159 This method initializes the database contents; the database must be empty
160 prior to calling this method.
162 Parameters
163 ----------
164 config : `RegistryConfig` or `str`, optional
165 Registry configuration. If missing, the default configuration will
166 be loaded from ``registry.yaml``.
167 dimensionConfig : `DimensionConfig` or `str`, optional
168 Dimensions configuration. If missing, the default configuration
169 will be loaded from ``dimensions.yaml``.
170 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
171 Path to the repository root this `SqlRegistry` will manage.
173 Returns
174 -------
175 registry : `SqlRegistry`
176 A new `SqlRegistry` instance.
177 """
178 config = cls.forceRegistryConfig(config)
179 config.replaceRoot(butlerRoot)
181 if isinstance(dimensionConfig, str):
182 dimensionConfig = DimensionConfig(dimensionConfig)
183 elif dimensionConfig is None:
184 dimensionConfig = DimensionConfig()
185 elif not isinstance(dimensionConfig, DimensionConfig):
186 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
188 DatabaseClass = config.getDatabaseClass()
189 database = DatabaseClass.fromUri(
190 config.connectionString, origin=config.get("origin", 0), namespace=config.get("namespace")
191 )
192 managerTypes = RegistryManagerTypes.fromConfig(config)
193 managers = managerTypes.makeRepo(database, dimensionConfig)
194 return cls(database, RegistryDefaults(), managers)
196 @classmethod
197 def fromConfig(
198 cls,
199 config: ButlerConfig | RegistryConfig | Config | str,
200 butlerRoot: ResourcePathExpression | None = None,
201 writeable: bool = True,
202 defaults: RegistryDefaults | None = None,
203 ) -> SqlRegistry:
204 """Create `Registry` subclass instance from `config`.
206 Registry database must be initialized prior to calling this method.
208 Parameters
209 ----------
210 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
211 Registry configuration.
212 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
213 Path to the repository root this `Registry` will manage.
214 writeable : `bool`, optional
215 If `True` (default) create a read-write connection to the database.
216 defaults : `RegistryDefaults`, optional
217 Default collection search path and/or output `~CollectionType.RUN`
218 collection.
220 Returns
221 -------
222 registry : `SqlRegistry`
223 A new `SqlRegistry` subclass instance.
224 """
225 config = cls.forceRegistryConfig(config)
226 config.replaceRoot(butlerRoot)
227 DatabaseClass = config.getDatabaseClass()
228 database = DatabaseClass.fromUri(
229 config.connectionString,
230 origin=config.get("origin", 0),
231 namespace=config.get("namespace"),
232 writeable=writeable,
233 )
234 managerTypes = RegistryManagerTypes.fromConfig(config)
235 with database.session():
236 managers = managerTypes.loadRepo(database)
237 if defaults is None:
238 defaults = RegistryDefaults()
239 return cls(database, defaults, managers)
241 def __init__(
242 self,
243 database: Database,
244 defaults: RegistryDefaults,
245 managers: RegistryManagerInstances,
246 ):
247 self._db = database
248 self._managers = managers
249 self.storageClasses = StorageClassFactory()
250 # This is public to SqlRegistry's internal-to-daf_butler callers, but
251 # it is intentionally not part of RegistryShim.
252 self.dimension_record_cache = DimensionRecordCache(
253 self._managers.dimensions.universe,
254 fetch=self._managers.dimensions.fetch_cache_dict,
255 )
256 # Intentionally invoke property setter to initialize defaults. This
257 # can only be done after most of the rest of Registry has already been
258 # initialized, and must be done before the property getter is used.
259 self.defaults = defaults
260 # TODO: This is currently initialized by `make_datastore_tables`,
261 # eventually we'll need to do it during construction.
262 # The mapping is indexed by the opaque table name.
263 self._datastore_record_classes: Mapping[str, type[StoredDatastoreItemInfo]] = {}
265 def __str__(self) -> str:
266 return str(self._db)
268 def __repr__(self) -> str:
269 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
271 def isWriteable(self) -> bool:
272 """Return `True` if this registry allows write operations, and `False`
273 otherwise.
274 """
275 return self._db.isWriteable()
277 def copy(self, defaults: RegistryDefaults | None = None) -> SqlRegistry:
278 """Create a new `SqlRegistry` backed by the same data repository
279 and connection as this one, but independent defaults.
281 Parameters
282 ----------
283 defaults : `~lsst.daf.butler.registry.RegistryDefaults`, optional
284 Default collections and data ID values for the new registry. If
285 not provided, ``self.defaults`` will be used (but future changes
286 to either registry's defaults will not affect the other).
288 Returns
289 -------
290 copy : `SqlRegistry`
291 A new `SqlRegistry` instance with its own defaults.
293 Notes
294 -----
295 Because the new registry shares a connection with the original, they
296 also share transaction state (despite the fact that their `transaction`
297 context manager methods do not reflect this), and must be used with
298 care.
299 """
300 if defaults is None:
301 # No need to copy, because `RegistryDefaults` is immutable; we
302 # effectively copy on write.
303 defaults = self.defaults
304 result = SqlRegistry(self._db, defaults, self._managers)
305 result.dimension_record_cache.load_from(self.dimension_record_cache)
306 return result
308 @property
309 def dimensions(self) -> DimensionUniverse:
310 """Definitions of all dimensions recognized by this `Registry`
311 (`DimensionUniverse`).
312 """
313 return self._managers.dimensions.universe
315 @property
316 def defaults(self) -> RegistryDefaults:
317 """Default collection search path and/or output `~CollectionType.RUN`
318 collection (`~lsst.daf.butler.registry.RegistryDefaults`).
320 This is an immutable struct whose components may not be set
321 individually, but the entire struct can be set by assigning to this
322 property.
323 """
324 return self._defaults
326 @defaults.setter
327 def defaults(self, value: RegistryDefaults) -> None:
328 if value.run is not None:
329 self.registerRun(value.run)
330 value.finish(self)
331 self._defaults = value
333 def refresh(self) -> None:
334 """Refresh all in-memory state by querying the database.
336 This may be necessary to enable querying for entities added by other
337 registry instances after this one was constructed.
338 """
339 self.dimension_record_cache.reset()
340 with self._db.transaction():
341 self._managers.refresh()
343 def caching_context(self) -> contextlib.AbstractContextManager[None]:
344 """Return context manager that enables caching.
346 Returns
347 -------
348 manager
349 A context manager that enables client-side caching. Entering
350 the context returns `None`.
351 """
352 return self._managers.caching_context_manager()
354 @contextlib.contextmanager
355 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
356 """Return a context manager that represents a transaction.
358 Parameters
359 ----------
360 savepoint : `bool`
361 Whether to issue a SAVEPOINT in the database.
363 Yields
364 ------
365 `None`
366 """
367 with self._db.transaction(savepoint=savepoint):
368 yield
370 def resetConnectionPool(self) -> None:
371 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
373 This operation is useful when using the registry with fork-based
374 multiprocessing. To use the registry across a fork boundary, make sure
375 that there are no currently active connections (no session or
376 transaction in progress) and reset the connection pool with this
377 method. It should be called by the child process immediately
378 after the fork.
379 """
380 self._db._engine.dispose()
382 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
383 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
384 other data repository client.
386 Opaque table records can be added via `insertOpaqueData`, retrieved via
387 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
389 Parameters
390 ----------
391 tableName : `str`
392 Logical name of the opaque table. This may differ from the
393 actual name used in the database by a prefix and/or suffix.
394 spec : `ddl.TableSpec`
395 Specification for the table to be added.
396 """
397 self._managers.opaque.register(tableName, spec)
399 @transactional
400 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
401 """Insert records into an opaque table.
403 Parameters
404 ----------
405 tableName : `str`
406 Logical name of the opaque table. Must match the name used in a
407 previous call to `registerOpaqueTable`.
408 *data
409 Each additional positional argument is a dictionary that represents
410 a single row to be added.
411 """
412 self._managers.opaque[tableName].insert(*data)
414 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[Mapping[str, Any]]:
415 """Retrieve records from an opaque table.
417 Parameters
418 ----------
419 tableName : `str`
420 Logical name of the opaque table. Must match the name used in a
421 previous call to `registerOpaqueTable`.
422 **where
423 Additional keyword arguments are interpreted as equality
424 constraints that restrict the returned rows (combined with AND);
425 keyword arguments are column names and values are the values they
426 must have.
428 Yields
429 ------
430 row : `dict`
431 A dictionary representing a single result row.
432 """
433 yield from self._managers.opaque[tableName].fetch(**where)
435 @transactional
436 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
437 """Remove records from an opaque table.
439 Parameters
440 ----------
441 tableName : `str`
442 Logical name of the opaque table. Must match the name used in a
443 previous call to `registerOpaqueTable`.
444 **where
445 Additional keyword arguments are interpreted as equality
446 constraints that restrict the deleted rows (combined with AND);
447 keyword arguments are column names and values are the values they
448 must have.
449 """
450 self._managers.opaque[tableName].delete(where.keys(), where)
452 def registerCollection(
453 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: str | None = None
454 ) -> bool:
455 """Add a new collection if one with the given name does not exist.
457 Parameters
458 ----------
459 name : `str`
460 The name of the collection to create.
461 type : `CollectionType`
462 Enum value indicating the type of collection to create.
463 doc : `str`, optional
464 Documentation string for the collection.
466 Returns
467 -------
468 registered : `bool`
469 Boolean indicating whether the collection was created by this call
470 (`True`) or already existed (`False`).
472 Notes
473 -----
474 This method cannot be called within transactions, as it needs to be
475 able to perform its own transaction to be concurrent.
476 """
477 _, registered = self._managers.collections.register(name, type, doc=doc)
478 return registered
480 def getCollectionType(self, name: str) -> CollectionType:
481 """Return an enumeration value indicating the type of the given
482 collection.
484 Parameters
485 ----------
486 name : `str`
487 The name of the collection.
489 Returns
490 -------
491 type : `CollectionType`
492 Enum value indicating the type of this collection.
494 Raises
495 ------
496 lsst.daf.butler.registry.MissingCollectionError
497 Raised if no collection with the given name exists.
498 """
499 return self._managers.collections.find(name).type
501 def _get_collection_record(self, name: str) -> CollectionRecord:
502 """Return the record for this collection.
504 Parameters
505 ----------
506 name : `str`
507 Name of the collection for which the record is to be retrieved.
509 Returns
510 -------
511 record : `CollectionRecord`
512 The record for this collection.
513 """
514 return self._managers.collections.find(name)
516 def registerRun(self, name: str, doc: str | None = None) -> bool:
517 """Add a new run if one with the given name does not exist.
519 Parameters
520 ----------
521 name : `str`
522 The name of the run to create.
523 doc : `str`, optional
524 Documentation string for the collection.
526 Returns
527 -------
528 registered : `bool`
529 Boolean indicating whether a new run was registered. `False`
530 if it already existed.
532 Notes
533 -----
534 This method cannot be called within transactions, as it needs to be
535 able to perform its own transaction to be concurrent.
536 """
537 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
538 return registered
540 @transactional
541 def removeCollection(self, name: str) -> None:
542 """Remove the given collection from the registry.
544 Parameters
545 ----------
546 name : `str`
547 The name of the collection to remove.
549 Raises
550 ------
551 lsst.daf.butler.registry.MissingCollectionError
552 Raised if no collection with the given name exists.
553 sqlalchemy.exc.IntegrityError
554 Raised if the database rows associated with the collection are
555 still referenced by some other table, such as a dataset in a
556 datastore (for `~CollectionType.RUN` collections only) or a
557 `~CollectionType.CHAINED` collection of which this collection is
558 a child.
560 Notes
561 -----
562 If this is a `~CollectionType.RUN` collection, all datasets and quanta
563 in it will be removed from the `Registry` database. This requires that
564 those datasets be removed (or at least trashed) from any datastores
565 that hold them first.
567 A collection may not be deleted as long as it is referenced by a
568 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
569 be deleted or redefined first.
570 """
571 self._managers.collections.remove(name)
573 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
574 """Return the child collections in a `~CollectionType.CHAINED`
575 collection.
577 Parameters
578 ----------
579 parent : `str`
580 Name of the chained collection. Must have already been added via
581 a call to `Registry.registerCollection`.
583 Returns
584 -------
585 children : `~collections.abc.Sequence` [ `str` ]
586 An ordered sequence of collection names that are searched when the
587 given chained collection is searched.
589 Raises
590 ------
591 lsst.daf.butler.registry.MissingCollectionError
592 Raised if ``parent`` does not exist in the `Registry`.
593 lsst.daf.butler.registry.CollectionTypeError
594 Raised if ``parent`` does not correspond to a
595 `~CollectionType.CHAINED` collection.
596 """
597 record = self._managers.collections.find(parent)
598 if record.type is not CollectionType.CHAINED:
599 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
600 assert isinstance(record, ChainedCollectionRecord)
601 return record.children
603 @transactional
604 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
605 """Define or redefine a `~CollectionType.CHAINED` collection.
607 Parameters
608 ----------
609 parent : `str`
610 Name of the chained collection. Must have already been added via
611 a call to `Registry.registerCollection`.
612 children : collection expression
613 An expression defining an ordered search of child collections,
614 generally an iterable of `str`; see
615 :ref:`daf_butler_collection_expressions` for more information.
616 flatten : `bool`, optional
617 If `True` (`False` is default), recursively flatten out any nested
618 `~CollectionType.CHAINED` collections in ``children`` first.
620 Raises
621 ------
622 lsst.daf.butler.registry.MissingCollectionError
623 Raised when any of the given collections do not exist in the
624 `Registry`.
625 lsst.daf.butler.registry.CollectionTypeError
626 Raised if ``parent`` does not correspond to a
627 `~CollectionType.CHAINED` collection.
628 ValueError
629 Raised if the given collections contain a cycle.
630 """
631 record = self._managers.collections.find(parent)
632 if record.type is not CollectionType.CHAINED:
633 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
634 assert isinstance(record, ChainedCollectionRecord)
635 children = CollectionWildcard.from_expression(children).require_ordered()
636 if children != record.children or flatten:
637 self._managers.collections.update_chain(record, children, flatten=flatten)
639 def getCollectionParentChains(self, collection: str) -> set[str]:
640 """Return the CHAINED collections that directly contain the given one.
642 Parameters
643 ----------
644 collection : `str`
645 Name of the collection.
647 Returns
648 -------
649 chains : `set` of `str`
650 Set of `~CollectionType.CHAINED` collection names.
651 """
652 return self._managers.collections.getParentChains(self._managers.collections.find(collection).key)
654 def getCollectionDocumentation(self, collection: str) -> str | None:
655 """Retrieve the documentation string for a collection.
657 Parameters
658 ----------
659 collection : `str`
660 Name of the collection.
662 Returns
663 -------
664 docs : `str` or `None`
665 Docstring for the collection with the given name.
666 """
667 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
669 def setCollectionDocumentation(self, collection: str, doc: str | None) -> None:
670 """Set the documentation string for a collection.
672 Parameters
673 ----------
674 collection : `str`
675 Name of the collection.
676 doc : `str` or `None`
677 Docstring for the collection with the given name; will replace any
678 existing docstring. Passing `None` will remove any existing
679 docstring.
680 """
681 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
683 def getCollectionSummary(self, collection: str) -> CollectionSummary:
684 """Return a summary for the given collection.
686 Parameters
687 ----------
688 collection : `str`
689 Name of the collection for which a summary is to be retrieved.
691 Returns
692 -------
693 summary : `~lsst.daf.butler.registry.CollectionSummary`
694 Summary of the dataset types and governor dimension values in
695 this collection.
696 """
697 record = self._managers.collections.find(collection)
698 return self._managers.datasets.getCollectionSummary(record)
700 def registerDatasetType(self, datasetType: DatasetType) -> bool:
701 """Add a new `DatasetType` to the Registry.
703 It is not an error to register the same `DatasetType` twice.
705 Parameters
706 ----------
707 datasetType : `DatasetType`
708 The `DatasetType` to be added.
710 Returns
711 -------
712 inserted : `bool`
713 `True` if ``datasetType`` was inserted, `False` if an identical
714 existing `DatasetType` was found. Note that in either case the
715 DatasetType is guaranteed to be defined in the Registry
716 consistently with the given definition.
718 Raises
719 ------
720 ValueError
721 Raised if the dimensions or storage class are invalid.
722 lsst.daf.butler.registry.ConflictingDefinitionError
723 Raised if this `DatasetType` is already registered with a different
724 definition.
726 Notes
727 -----
728 This method cannot be called within transactions, as it needs to be
729 able to perform its own transaction to be concurrent.
730 """
731 return self._managers.datasets.register(datasetType)
733 def removeDatasetType(self, name: str | tuple[str, ...]) -> None:
734 """Remove the named `DatasetType` from the registry.
736 .. warning::
738 Registry implementations can cache the dataset type definitions.
739 This means that deleting the dataset type definition may result in
740 unexpected behavior from other active butler processes that have
741 not seen the deletion.
743 Parameters
744 ----------
745 name : `str` or `tuple` [`str`]
746 Name of the dataset type to be removed, or a tuple of such names.
747 Wildcards are allowed.
749 Raises
750 ------
751 lsst.daf.butler.registry.OrphanedRecordError
752 Raised if an attempt is made to remove the dataset type definition
753 when there are already datasets associated with it.
755 Notes
756 -----
757 If the dataset type is not registered the method will return without
758 action.
759 """
760 for datasetTypeExpression in ensure_iterable(name):
761 # Catch any warnings from the caller specifying a component
762 # dataset type. This will result in an error later but the
763 # warning could be confusing when the caller is not querying
764 # anything.
765 with warnings.catch_warnings():
766 warnings.simplefilter("ignore", category=FutureWarning)
767 datasetTypes = list(self.queryDatasetTypes(datasetTypeExpression))
768 if not datasetTypes:
769 _LOG.info("Dataset type %r not defined", datasetTypeExpression)
770 else:
771 for datasetType in datasetTypes:
772 self._managers.datasets.remove(datasetType.name)
773 _LOG.info("Removed dataset type %r", datasetType.name)
775 def getDatasetType(self, name: str) -> DatasetType:
776 """Get the `DatasetType`.
778 Parameters
779 ----------
780 name : `str`
781 Name of the type.
783 Returns
784 -------
785 type : `DatasetType`
786 The `DatasetType` associated with the given name.
788 Raises
789 ------
790 lsst.daf.butler.registry.MissingDatasetTypeError
791 Raised if the requested dataset type has not been registered.
793 Notes
794 -----
795 This method handles component dataset types automatically, though most
796 other registry operations do not.
797 """
798 parent_name, component = DatasetType.splitDatasetTypeName(name)
799 storage = self._managers.datasets[parent_name]
800 if component is None:
801 return storage.datasetType
802 else:
803 return storage.datasetType.makeComponentDatasetType(component)
805 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
806 """Test whether the given dataset ID generation mode is supported by
807 `insertDatasets`.
809 Parameters
810 ----------
811 mode : `DatasetIdGenEnum`
812 Enum value for the mode to test.
814 Returns
815 -------
816 supported : `bool`
817 Whether the given mode is supported.
818 """
819 return self._managers.datasets.supportsIdGenerationMode(mode)
821 def findDataset(
822 self,
823 datasetType: DatasetType | str,
824 dataId: DataId | None = None,
825 *,
826 collections: CollectionArgType | None = None,
827 timespan: Timespan | None = None,
828 datastore_records: bool = False,
829 **kwargs: Any,
830 ) -> DatasetRef | None:
831 """Find a dataset given its `DatasetType` and data ID.
833 This can be used to obtain a `DatasetRef` that permits the dataset to
834 be read from a `Datastore`. If the dataset is a component and cannot
835 be found using the provided dataset type, a dataset ref for the parent
836 will be returned instead but with the correct dataset type.
838 Parameters
839 ----------
840 datasetType : `DatasetType` or `str`
841 A `DatasetType` or the name of one. If this is a `DatasetType`
842 instance, its storage class will be respected and propagated to
843 the output, even if it differs from the dataset type definition
844 in the registry, as long as the storage classes are convertible.
845 dataId : `dict` or `DataCoordinate`, optional
846 A `dict`-like object containing the `Dimension` links that identify
847 the dataset within a collection.
848 collections : collection expression, optional
849 An expression that fully or partially identifies the collections to
850 search for the dataset; see
851 :ref:`daf_butler_collection_expressions` for more information.
852 Defaults to ``self.defaults.collections``.
853 timespan : `Timespan`, optional
854 A timespan that the validity range of the dataset must overlap.
855 If not provided, any `~CollectionType.CALIBRATION` collections
856 matched by the ``collections`` argument will not be searched.
857 datastore_records : `bool`, optional
858 Whether to attach datastore records to the `DatasetRef`.
859 **kwargs
860 Additional keyword arguments passed to
861 `DataCoordinate.standardize` to convert ``dataId`` to a true
862 `DataCoordinate` or augment an existing one.
864 Returns
865 -------
866 ref : `DatasetRef`
867 A reference to the dataset, or `None` if no matching Dataset
868 was found.
870 Raises
871 ------
872 lsst.daf.butler.registry.NoDefaultCollectionError
873 Raised if ``collections`` is `None` and
874 ``self.defaults.collections`` is `None`.
875 LookupError
876 Raised if one or more data ID keys are missing.
877 lsst.daf.butler.registry.MissingDatasetTypeError
878 Raised if the dataset type does not exist.
879 lsst.daf.butler.registry.MissingCollectionError
880 Raised if any of ``collections`` does not exist in the registry.
882 Notes
883 -----
884 This method simply returns `None` and does not raise an exception even
885 when the set of collections searched is intrinsically incompatible with
886 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
887 only `~CollectionType.CALIBRATION` collections are being searched.
888 This may make it harder to debug some lookup failures, but the behavior
889 is intentional; we consider it more important that failed searches are
890 reported consistently, regardless of the reason, and that adding
891 additional collections that do not contain a match to the search path
892 never changes the behavior.
894 This method handles component dataset types automatically, though most
895 other registry operations do not.
896 """
897 if collections is None:
898 if not self.defaults.collections:
899 raise NoDefaultCollectionError(
900 "No collections provided to findDataset, and no defaults from registry construction."
901 )
902 collections = self.defaults.collections
903 backend = queries.SqlQueryBackend(self._db, self._managers, self.dimension_record_cache)
904 with backend.caching_context():
905 collection_wildcard = CollectionWildcard.from_expression(collections, require_ordered=True)
906 if collection_wildcard.empty():
907 return None
908 matched_collections = backend.resolve_collection_wildcard(collection_wildcard)
909 resolved_dataset_type = backend.resolve_single_dataset_type_wildcard(datasetType)
910 dataId = DataCoordinate.standardize(
911 dataId,
912 dimensions=resolved_dataset_type.dimensions,
913 universe=self.dimensions,
914 defaults=self.defaults.dataId,
915 **kwargs,
916 )
917 governor_constraints = {name: {cast(str, dataId[name])} for name in dataId.dimensions.governors}
918 (filtered_collections,) = backend.filter_dataset_collections(
919 [resolved_dataset_type],
920 matched_collections,
921 governor_constraints=governor_constraints,
922 ).values()
923 if not filtered_collections:
924 return None
925 if timespan is None:
926 filtered_collections = [
927 collection_record
928 for collection_record in filtered_collections
929 if collection_record.type is not CollectionType.CALIBRATION
930 ]
931 if filtered_collections:
932 requested_columns = {"dataset_id", "run", "collection"}
933 with backend.context() as context:
934 predicate = context.make_data_coordinate_predicate(
935 dataId.subset(resolved_dataset_type.dimensions), full=False
936 )
937 if timespan is not None:
938 requested_columns.add("timespan")
939 predicate = predicate.logical_and(
940 context.make_timespan_overlap_predicate(
941 DatasetColumnTag(resolved_dataset_type.name, "timespan"), timespan
942 )
943 )
944 relation = backend.make_dataset_query_relation(
945 resolved_dataset_type, filtered_collections, requested_columns, context
946 ).with_rows_satisfying(predicate)
947 rows = list(context.fetch_iterable(relation))
948 else:
949 rows = []
950 if not rows:
951 return None
952 elif len(rows) == 1:
953 best_row = rows[0]
954 else:
955 rank_by_collection_key = {record.key: n for n, record in enumerate(filtered_collections)}
956 collection_tag = DatasetColumnTag(resolved_dataset_type.name, "collection")
957 row_iter = iter(rows)
958 best_row = next(row_iter)
959 best_rank = rank_by_collection_key[best_row[collection_tag]]
960 have_tie = False
961 for row in row_iter:
962 if (rank := rank_by_collection_key[row[collection_tag]]) < best_rank:
963 best_row = row
964 best_rank = rank
965 have_tie = False
966 elif rank == best_rank:
967 have_tie = True
968 assert timespan is not None, "Rank ties should be impossible given DB constraints."
969 if have_tie:
970 raise LookupError(
971 f"Ambiguous calibration lookup for {resolved_dataset_type.name} in collections "
972 f"{collection_wildcard.strings} with timespan {timespan}."
973 )
974 reader = queries.DatasetRefReader(
975 resolved_dataset_type,
976 translate_collection=lambda k: self._managers.collections[k].name,
977 )
978 ref = reader.read(best_row, data_id=dataId)
979 if datastore_records:
980 ref = self.get_datastore_records(ref)
982 return ref
984 @transactional
985 def insertDatasets(
986 self,
987 datasetType: DatasetType | str,
988 dataIds: Iterable[DataId],
989 run: str | None = None,
990 expand: bool = True,
991 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
992 ) -> list[DatasetRef]:
993 """Insert one or more datasets into the `Registry`.
995 This always adds new datasets; to associate existing datasets with
996 a new collection, use ``associate``.
998 Parameters
999 ----------
1000 datasetType : `DatasetType` or `str`
1001 A `DatasetType` or the name of one.
1002 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
1003 Dimension-based identifiers for the new datasets.
1004 run : `str`, optional
1005 The name of the run that produced the datasets. Defaults to
1006 ``self.defaults.run``.
1007 expand : `bool`, optional
1008 If `True` (default), expand data IDs as they are inserted. This is
1009 necessary in general to allow datastore to generate file templates,
1010 but it may be disabled if the caller can guarantee this is
1011 unnecessary.
1012 idGenerationMode : `DatasetIdGenEnum`, optional
1013 Specifies option for generating dataset IDs. By default unique IDs
1014 are generated for each inserted dataset.
1016 Returns
1017 -------
1018 refs : `list` of `DatasetRef`
1019 Resolved `DatasetRef` instances for all given data IDs (in the same
1020 order).
1022 Raises
1023 ------
1024 lsst.daf.butler.registry.DatasetTypeError
1025 Raised if ``datasetType`` is not known to registry.
1026 lsst.daf.butler.registry.CollectionTypeError
1027 Raised if ``run`` collection type is not `~CollectionType.RUN`.
1028 lsst.daf.butler.registry.NoDefaultCollectionError
1029 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1030 lsst.daf.butler.registry.ConflictingDefinitionError
1031 If a dataset with the same dataset type and data ID as one of those
1032 given already exists in ``run``.
1033 lsst.daf.butler.registry.MissingCollectionError
1034 Raised if ``run`` does not exist in the registry.
1035 """
1036 if isinstance(datasetType, DatasetType):
1037 storage = self._managers.datasets.find(datasetType.name)
1038 if storage is None:
1039 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1040 else:
1041 storage = self._managers.datasets.find(datasetType)
1042 if storage is None:
1043 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
1044 if run is None:
1045 if self.defaults.run is None:
1046 raise NoDefaultCollectionError(
1047 "No run provided to insertDatasets, and no default from registry construction."
1048 )
1049 run = self.defaults.run
1050 runRecord = self._managers.collections.find(run)
1051 if runRecord.type is not CollectionType.RUN:
1052 raise CollectionTypeError(
1053 f"Given collection is of type {runRecord.type.name}; RUN collection required."
1054 )
1055 assert isinstance(runRecord, RunRecord)
1056 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1057 if expand:
1058 expandedDataIds = [
1059 self.expandDataId(dataId, dimensions=storage.datasetType.dimensions)
1060 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
1061 ]
1062 else:
1063 expandedDataIds = [
1064 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
1065 ]
1066 try:
1067 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
1068 if self._managers.obscore:
1069 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1070 self._managers.obscore.add_datasets(refs, context)
1071 except sqlalchemy.exc.IntegrityError as err:
1072 raise ConflictingDefinitionError(
1073 "A database constraint failure was triggered by inserting "
1074 f"one or more datasets of type {storage.datasetType} into "
1075 f"collection '{run}'. "
1076 "This probably means a dataset with the same data ID "
1077 "and dataset type already exists, but it may also mean a "
1078 "dimension row is missing."
1079 ) from err
1080 return refs
1082 @transactional
1083 def _importDatasets(
1084 self,
1085 datasets: Iterable[DatasetRef],
1086 expand: bool = True,
1087 ) -> list[DatasetRef]:
1088 """Import one or more datasets into the `Registry`.
1090 The difference from the `insertDatasets` method is that this method
1091 accepts `DatasetRef` instances that are already resolved and have a
1092 dataset ID. If the registry supports globally-unique dataset IDs (e.g.
1093 `uuid.UUID`) then datasets which already exist in the registry will be
1094 ignored if imported again.
1096 Parameters
1097 ----------
1098 datasets : `~collections.abc.Iterable` of `DatasetRef`
1099 Datasets to be inserted. All `DatasetRef` instances must have
1100 identical ``datasetType`` and ``run`` attributes. ``run``
1101 attribute can be `None` and defaults to ``self.defaults.run``.
1102 Datasets can specify ``id`` attribute which will be used for
1103 inserted datasets. All dataset IDs must have the same type
1104 (`int` or `uuid.UUID`), if type of dataset IDs does not match
1105 configured backend then IDs will be ignored and new IDs will be
1106 generated by backend.
1107 expand : `bool`, optional
1108 If `True` (default), expand data IDs as they are inserted. This is
1109 necessary in general, but it may be disabled if the caller can
1110 guarantee this is unnecessary.
1112 Returns
1113 -------
1114 refs : `list` of `DatasetRef`
1115 Resolved `DatasetRef` instances for all given data IDs (in the same
1116 order). If any of ``datasets`` has an ID which already exists in
1117 the database then it will not be inserted or updated, but a
1118 resolved `DatasetRef` will be returned for it in any case.
1120 Raises
1121 ------
1122 lsst.daf.butler.registry.NoDefaultCollectionError
1123 Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
1124 lsst.daf.butler.registry.DatasetTypeError
1125 Raised if datasets correspond to more than one dataset type or
1126 dataset type is not known to registry.
1127 lsst.daf.butler.registry.ConflictingDefinitionError
1128 If a dataset with the same dataset type and data ID as one of those
1129 given already exists in ``run``.
1130 lsst.daf.butler.registry.MissingCollectionError
1131 Raised if ``run`` does not exist in the registry.
1133 Notes
1134 -----
1135 This method is considered package-private and internal to the Butler
1136 implementation. Clients outside the daf_butler package should not use this
1137 method.
1138 """
1139 datasets = list(datasets)
1140 if not datasets:
1141 # nothing to do
1142 return []
1144 # find dataset type
1145 datasetTypes = {dataset.datasetType for dataset in datasets}
1146 if len(datasetTypes) != 1:
1147 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
1148 datasetType = datasetTypes.pop()
1150 # get storage handler for this dataset type
1151 storage = self._managers.datasets.find(datasetType.name)
1152 if storage is None:
1153 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
1155 # find run name
1156 runs = {dataset.run for dataset in datasets}
1157 if len(runs) != 1:
1158 raise ValueError(f"Multiple run names in input datasets: {runs}")
1159 run = runs.pop()
1161 runRecord = self._managers.collections.find(run)
1162 if runRecord.type is not CollectionType.RUN:
1163 raise CollectionTypeError(
1164 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
1165 " RUN collection required."
1166 )
1167 assert isinstance(runRecord, RunRecord)
1169 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
1170 if expand:
1171 expandedDatasets = [
1172 dataset.expanded(self.expandDataId(dataset.dataId, dimensions=storage.datasetType.dimensions))
1173 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
1174 ]
1175 else:
1176 expandedDatasets = [
1177 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
1178 for dataset in datasets
1179 ]
1181 try:
1182 refs = list(storage.import_(runRecord, expandedDatasets))
1183 if self._managers.obscore:
1184 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1185 self._managers.obscore.add_datasets(refs, context)
1186 except sqlalchemy.exc.IntegrityError as err:
1187 raise ConflictingDefinitionError(
1188 "A database constraint failure was triggered by inserting "
1189 f"one or more datasets of type {storage.datasetType} into "
1190 f"collection '{run}'. "
1191 "This probably means a dataset with the same data ID "
1192 "and dataset type already exists, but it may also mean a "
1193 "dimension row is missing."
1194 ) from err
1195 # Check that imported dataset IDs match the input
1196 for imported_ref, input_ref in zip(refs, datasets, strict=True):
1197 if imported_ref.id != input_ref.id:
1198 raise RegistryConsistencyError(
1199 "Imported dataset ID differs from input dataset ID, "
1200 f"input ref: {input_ref}, imported ref: {imported_ref}"
1201 )
1202 return refs
1204 def getDataset(self, id: DatasetId) -> DatasetRef | None:
1205 """Retrieve a Dataset entry.
1207 Parameters
1208 ----------
1209 id : `DatasetId`
1210 The unique identifier for the dataset.
1212 Returns
1213 -------
1214 ref : `DatasetRef` or `None`
1215 A ref to the Dataset, or `None` if no matching Dataset
1216 was found.
1217 """
1218 return self._managers.datasets.getDatasetRef(id)
1220 @transactional
1221 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
1222 """Remove datasets from the Registry.
1224 The datasets will be removed unconditionally from all collections, and
1225 any `Quantum` that consumed this dataset will instead be marked as
1226 having a NULL input. `Datastore` records will *not* be deleted; the
1227 caller is responsible for ensuring that the dataset has already been
1228 removed from all Datastores.
1230 Parameters
1231 ----------
1232 refs : `~collections.abc.Iterable` [`DatasetRef`]
1233 References to the datasets to be removed. Must include a valid
1234 ``id`` attribute, and should be considered invalidated upon return.
1236 Raises
1237 ------
1238 lsst.daf.butler.AmbiguousDatasetError
1239 Raised if any ``ref.id`` is `None`.
1240 lsst.daf.butler.registry.OrphanedRecordError
1241 Raised if any dataset is still present in any `Datastore`.
1242 """
1243 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
1244 for datasetType, refsForType in progress.iter_item_chunks(
1245 DatasetRef.iter_by_type(refs), desc="Removing datasets by type"
1246 ):
1247 storage = self._managers.datasets[datasetType.name]
1248 try:
1249 storage.delete(refsForType)
1250 except sqlalchemy.exc.IntegrityError as err:
1251 raise OrphanedRecordError(
1252 "One or more datasets is still present in one or more Datastores."
1253 ) from err
1255 @transactional
1256 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1257 """Add existing datasets to a `~CollectionType.TAGGED` collection.
1259 If a `DatasetRef` with the same ID is already in the collection,
1260 nothing is changed. If a `DatasetRef` with the same `DatasetType` and
1261 data ID but with different ID exists in the collection,
1262 `~lsst.daf.butler.registry.ConflictingDefinitionError` is raised.
1264 Parameters
1265 ----------
1266 collection : `str`
1267 Indicates the collection the datasets should be associated with.
1268 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1269 An iterable of resolved `DatasetRef` instances that already exist
1270 in this `Registry`.
1272 Raises
1273 ------
1274 lsst.daf.butler.registry.ConflictingDefinitionError
1275 If a Dataset with the given `DatasetRef` already exists in the
1276 given collection.
1277 lsst.daf.butler.registry.MissingCollectionError
1278 Raised if ``collection`` does not exist in the registry.
1279 lsst.daf.butler.registry.CollectionTypeError
1280 Raised if adding new datasets to the given ``collection`` is not
1281 allowed.
1282 """
1283 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
1284 collectionRecord = self._managers.collections.find(collection)
1285 if collectionRecord.type is not CollectionType.TAGGED:
1286 raise CollectionTypeError(
1287 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
1288 )
1289 for datasetType, refsForType in progress.iter_item_chunks(
1290 DatasetRef.iter_by_type(refs), desc="Associating datasets by type"
1291 ):
1292 storage = self._managers.datasets[datasetType.name]
1293 try:
1294 storage.associate(collectionRecord, refsForType)
1295 if self._managers.obscore:
1296 # If a TAGGED collection is being monitored by ObsCore
1297 # manager then we may need to save the dataset.
1298 context = queries.SqlQueryContext(self._db, self._managers.column_types)
1299 self._managers.obscore.associate(refsForType, collectionRecord, context)
1300 except sqlalchemy.exc.IntegrityError as err:
1301 raise ConflictingDefinitionError(
1302 f"Constraint violation while associating dataset of type {datasetType.name} with "
1303 f"collection {collection}. This probably means that one or more datasets with the same "
1304 "dataset type and data ID already exist in the collection, but it may also indicate "
1305 "that the datasets do not exist."
1306 ) from err
1308 @transactional
1309 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
1310 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
1312 ``collection`` and ``ref`` combinations that are not currently
1313 associated are silently ignored.
1315 Parameters
1316 ----------
1317 collection : `str`
1318 The collection the datasets should no longer be associated with.
1319 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1320 An iterable of resolved `DatasetRef` instances that already exist
1321 in this `Registry`.
1323 Raises
1324 ------
1325 lsst.daf.butler.AmbiguousDatasetError
1326 Raised if any of the given dataset references is unresolved.
1327 lsst.daf.butler.registry.MissingCollectionError
1328 Raised if ``collection`` does not exist in the registry.
1329 lsst.daf.butler.registry.CollectionTypeError
1330 Raised if removing datasets from the given ``collection`` is not
1331 allowed.
1332 """
1333 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
1334 collectionRecord = self._managers.collections.find(collection)
1335 if collectionRecord.type is not CollectionType.TAGGED:
1336 raise CollectionTypeError(
1337 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
1338 )
1339 for datasetType, refsForType in progress.iter_item_chunks(
1340 DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type"
1341 ):
1342 storage = self._managers.datasets[datasetType.name]
1343 storage.disassociate(collectionRecord, refsForType)
1344 if self._managers.obscore:
1345 self._managers.obscore.disassociate(refsForType, collectionRecord)
1347 @transactional
1348 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
1349 """Associate one or more datasets with a calibration collection and a
1350 validity range within it.
1352 Parameters
1353 ----------
1354 collection : `str`
1355 The name of an already-registered `~CollectionType.CALIBRATION`
1356 collection.
1357 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1358 Datasets to be associated.
1359 timespan : `Timespan`
1360 The validity range for these datasets within the collection.
1362 Raises
1363 ------
1364 lsst.daf.butler.AmbiguousDatasetError
1365 Raised if any of the given `DatasetRef` instances is unresolved.
1366 lsst.daf.butler.registry.ConflictingDefinitionError
1367 Raised if the collection already contains a different dataset with
1368 the same `DatasetType` and data ID and an overlapping validity
1369 range.
1370 lsst.daf.butler.registry.CollectionTypeError
1371 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1372 collection or if one or more datasets are of a dataset type for
1373 which `DatasetType.isCalibration` returns `False`.
1374 """
1375 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
1376 collectionRecord = self._managers.collections.find(collection)
1377 for datasetType, refsForType in progress.iter_item_chunks(
1378 DatasetRef.iter_by_type(refs), desc="Certifying datasets by type"
1379 ):
1380 storage = self._managers.datasets[datasetType.name]
1381 storage.certify(
1382 collectionRecord,
1383 refsForType,
1384 timespan,
1385 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1386 )
1388 @transactional
1389 def decertify(
1390 self,
1391 collection: str,
1392 datasetType: str | DatasetType,
1393 timespan: Timespan,
1394 *,
1395 dataIds: Iterable[DataId] | None = None,
1396 ) -> None:
1397 """Remove or adjust datasets to clear a validity range within a
1398 calibration collection.
1400 Parameters
1401 ----------
1402 collection : `str`
1403 The name of an already-registered `~CollectionType.CALIBRATION`
1404 collection.
1405 datasetType : `str` or `DatasetType`
1406 Name or `DatasetType` instance for the datasets to be decertified.
1407 timespan : `Timespan`
1408 The validity range to remove datasets from within the collection.
1409 Datasets that overlap this range but are not contained by it will
1410 have their validity ranges adjusted to not overlap it, which may
1411 split a single dataset validity range into two.
1412 dataIds : iterable [`dict` or `DataCoordinate`], optional
1413 Data IDs that should be decertified within the given validity range.
1414 If `None`, all data IDs for ``datasetType`` will be
1415 decertified.
1417 Raises
1418 ------
1419 lsst.daf.butler.registry.CollectionTypeError
1420 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
1421 collection or if ``datasetType.isCalibration() is False``.
1422 """
1423 collectionRecord = self._managers.collections.find(collection)
1424 if isinstance(datasetType, str):
1425 storage = self._managers.datasets[datasetType]
1426 else:
1427 storage = self._managers.datasets[datasetType.name]
1428 standardizedDataIds = None
1429 if dataIds is not None:
1430 standardizedDataIds = [
1431 DataCoordinate.standardize(d, dimensions=storage.datasetType.dimensions) for d in dataIds
1432 ]
1433 storage.decertify(
1434 collectionRecord,
1435 timespan,
1436 dataIds=standardizedDataIds,
1437 context=queries.SqlQueryContext(self._db, self._managers.column_types),
1438 )
1440 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
1441 """Return an object that allows a new `Datastore` instance to
1442 communicate with this `Registry`.
1444 Returns
1445 -------
1446 manager : `~.interfaces.DatastoreRegistryBridgeManager`
1447 Object that mediates communication between this `Registry` and its
1448 associated datastores.
1449 """
1450 return self._managers.datastores
1452 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
1453 """Retrieve datastore locations for a given dataset.
1455 Parameters
1456 ----------
1457 ref : `DatasetRef`
1458 A reference to the dataset for which to retrieve storage
1459 information.
1461 Returns
1462 -------
1463 datastores : `~collections.abc.Iterable` [ `str` ]
1464 All the matching datastores holding this dataset.
1466 Raises
1467 ------
1468 lsst.daf.butler.AmbiguousDatasetError
1469 Raised if ``ref.id`` is `None`.
1470 """
1471 return self._managers.datastores.findDatastores(ref)
1473 def expandDataId(
1474 self,
1475 dataId: DataId | None = None,
1476 *,
1477 dimensions: Iterable[str] | DimensionGroup | DimensionGraph | None = None,
1478 graph: DimensionGraph | None = None,
1479 records: NameLookupMapping[DimensionElement, DimensionRecord | None] | None = None,
1480 withDefaults: bool = True,
1481 **kwargs: Any,
1482 ) -> DataCoordinate:
1483 """Expand a dimension-based data ID to include additional information.
1485 Parameters
1486 ----------
1487 dataId : `DataCoordinate` or `dict`, optional
1488 Data ID to be expanded; augmented and overridden by ``kwargs``.
1489 dimensions : `~collections.abc.Iterable` [ `str` ], \
1490 `DimensionGroup`, or `DimensionGraph`, optional
1491 The dimensions to be identified by the new `DataCoordinate`.
1492 If not provided, will be inferred from the keys of ``dataId`` and
1493 ``**kwargs``.
1495 graph : `DimensionGraph`, optional
1496 Like ``dimensions``, but as a ``DimensionGraph`` instance. Ignored
1497 if ``dimensions`` is provided. Deprecated and will be removed
1498 after v27.
1499 records : `~collections.abc.Mapping` [`str`, `DimensionRecord`], \
1500 optional
1501 Dimension record data to use before querying the database for that
1502 data, keyed by element name.
1503 withDefaults : `bool`, optional
1504 Utilize ``self.defaults.dataId`` to fill in missing governor
1505 dimension key-value pairs. Defaults to `True` (i.e. defaults are
1506 used).
1507 **kwargs
1508 Additional keywords are treated like additional key-value pairs for
1509 ``dataId``, extending and overriding.
1511 Returns
1512 -------
1513 expanded : `DataCoordinate`
1514 A data ID that includes full metadata for all of the dimensions it
1515 identifies, i.e. guarantees that ``expanded.hasRecords()`` and
1516 ``expanded.hasFull()`` both return `True`.
1518 Raises
1519 ------
1520 lsst.daf.butler.registry.DataIdError
1521 Raised when ``dataId`` or keyword arguments specify unknown
1522 dimensions or values, or when a resulting data ID contains
1523 contradictory key-value pairs, according to dimension
1524 relationships.
1526 Notes
1527 -----
1528 This method cannot be relied upon to reject invalid data ID values
1529 for dimensions that do not actually have any record columns. For
1530 efficiency reasons the records for these dimensions (which have only
1531 dimension key values that are given by the caller) may be constructed
1532 directly rather than obtained from the registry database.
1533 """
1534 if not withDefaults:
1535 defaults = None
1536 else:
1537 defaults = self.defaults.dataId
1538 try:
1539 standardized = DataCoordinate.standardize(
1540 dataId,
1541 graph=graph,
1542 dimensions=dimensions,
1543 universe=self.dimensions,
1544 defaults=defaults,
1545 **kwargs,
1546 )
1547 except KeyError as exc:
1548 # This means either kwargs have some odd name or required
1549 # dimension is missing.
1550 raise DimensionNameError(str(exc)) from exc
1551 if standardized.hasRecords():
1552 return standardized
1553 if records is None:
1554 records = {}
1555 elif isinstance(records, NamedKeyMapping):
1556 records = records.byName()
1557 else:
1558 records = dict(records)
1559 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
1560 for element_name in dataId.dimensions.elements:
1561 records[element_name] = dataId.records[element_name]
1562 keys = dict(standardized.mapping)
1563 for element_name in standardized.dimensions.lookup_order:
1564 element = self.dimensions[element_name]
1565 record = records.get(element_name, ...) # Use ... to mean not found; None might mean NULL
1566 if record is ...:
1567 if element_name in self.dimensions.dimensions.names and keys.get(element_name) is None:
1568 if element_name in standardized.dimensions.required:
1569 raise DimensionNameError(
1570 f"No value or null value for required dimension {element_name}."
1571 )
1572 keys[element_name] = None
1573 record = None
1574 else:
1575 record = self._managers.dimensions.fetch_one(
1576 element_name,
1577 DataCoordinate.standardize(keys, dimensions=element.minimal_group),
1578 self.dimension_record_cache,
1579 )
1580 records[element_name] = record
1581 if record is not None:
1582 for d in element.implied:
1583 value = getattr(record, d.name)
1584 if keys.setdefault(d.name, value) != value:
1585 raise InconsistentDataIdError(
1586 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
1587 f"but {element_name} implies {d.name}={value!r}."
1588 )
1589 else:
1590 if element_name in standardized.dimensions.required:
1591 raise DataIdValueError(
1592 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1593 )
1594 if element.defines_relationships:
1595 raise InconsistentDataIdError(
1596 f"Could not fetch record for element {element_name} via keys {keys}, ",
1597 "but it is marked as defining relationships; this means one or more dimensions are "
1598 "have inconsistent values.",
1599 )
1600 for d in element.implied:
1601 keys.setdefault(d.name, None)
1602 records.setdefault(d.name, None)
1603 return DataCoordinate.standardize(keys, dimensions=standardized.dimensions).expanded(records=records)
1605 def insertDimensionData(
1606 self,
1607 element: DimensionElement | str,
1608 *data: Mapping[str, Any] | DimensionRecord,
1609 conform: bool = True,
1610 replace: bool = False,
1611 skip_existing: bool = False,
1612 ) -> None:
1613 """Insert one or more dimension records into the database.
1615 Parameters
1616 ----------
1617 element : `DimensionElement` or `str`
1618 The `DimensionElement` or name thereof that identifies the table
1619 records will be inserted into.
1620 *data : `dict` or `DimensionRecord`
1621 One or more records to insert.
1622 conform : `bool`, optional
1623 If `False` (`True` is default) perform no checking or conversions,
1624 and assume that ``element`` is a `DimensionElement` instance and
1625 ``data`` is one or more `DimensionRecord` instances of the
1626 appropriate subclass.
1627 replace : `bool`, optional
1628 If `True` (`False` is default), replace existing records in the
1629 database if there is a conflict.
1630 skip_existing : `bool`, optional
1631 If `True` (`False` is default), skip insertion if a record with
1632 the same primary key values already exists. Unlike
1633 `syncDimensionData`, this will not detect when the given record
1634 differs from what is in the database, and should not be used when
1635 this is a concern.
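Examples
--------
A sketch of inserting a single ``instrument`` record; the field values
are hypothetical and the exact set of accepted fields depends on the
dimension configuration in use:

>>> registry.insertDimensionData(
...     "instrument",
...     {"name": "MyCam", "class_name": "my.obs.package.MyCam"},
... )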
1636 """
1637 if isinstance(element, str):
1638 element = self.dimensions[element]
1639 if conform:
1640 records = [
1641 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
1642 ]
1643 else:
1644 # Ignore typing since caller said to trust them with conform=False.
1645 records = data # type: ignore
1646 self._managers.dimensions.insert(
1647 element,
1648 *records,
1649 cache=self.dimension_record_cache,
1650 replace=replace,
1651 skip_existing=skip_existing,
1652 )
1654 def syncDimensionData(
1655 self,
1656 element: DimensionElement | str,
1657 row: Mapping[str, Any] | DimensionRecord,
1658 conform: bool = True,
1659 update: bool = False,
1660 ) -> bool | dict[str, Any]:
1661 """Synchronize the given dimension record with the database, inserting
1662 if it does not already exist and comparing values if it does.
1664 Parameters
1665 ----------
1666 element : `DimensionElement` or `str`
1667 The `DimensionElement` or name thereof that identifies the table
1668 records will be inserted into.
1669 row : `dict` or `DimensionRecord`
1670 The record to insert.
1671 conform : `bool`, optional
1672 If `False` (`True` is default) perform no checking or conversions,
1673 and assume that ``element`` is a `DimensionElement` instance and
1674 ``row`` is a `DimensionRecord` instance of the appropriate
1675 subclass.
1676 update : `bool`, optional
1677 If `True` (`False` is default), update the existing record in the
1678 database if there is a conflict.
1680 Returns
1681 -------
1682 inserted_or_updated : `bool` or `dict`
1683 `True` if a new row was inserted, `False` if no changes were
1684 needed, or a `dict` mapping updated column names to their old
1685 values if an update was performed (only possible if
1686 ``update=True``).
1688 Raises
1689 ------
1690 lsst.daf.butler.registry.ConflictingDefinitionError
1691 Raised if the record exists in the database (according to primary
1692 key lookup) but is inconsistent with the given one.
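Examples
--------
A sketch showing how the return value distinguishes an insert from a
no-op, using a hypothetical ``instrument`` record:

>>> row = {"name": "MyCam", "class_name": "my.obs.package.MyCam"}
>>> registry.syncDimensionData("instrument", row)  # first call inserts
True
>>> registry.syncDimensionData("instrument", row)  # identical row
False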
1693 """
1694 if conform:
1695 if isinstance(element, str):
1696 element = self.dimensions[element]
1697 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1698 else:
1699 # Ignore typing since caller said to trust them with conform=False.
1700 record = row # type: ignore
1701 return self._managers.dimensions.sync(record, self.dimension_record_cache, update=update)
1703 def queryDatasetTypes(
1704 self,
1705 expression: Any = ...,
1706 *,
1707 components: bool | _Marker = _DefaultMarker,
1708 missing: list[str] | None = None,
1709 ) -> Iterable[DatasetType]:
1710 """Iterate over the dataset types whose names match an expression.
1712 Parameters
1713 ----------
1714 expression : dataset type expression, optional
1715 An expression that fully or partially identifies the dataset types
1716 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1717 ``...`` can be used to return all dataset types, and is the
1718 default. See :ref:`daf_butler_dataset_type_expressions` for more
1719 information.
1720 components : `bool`, optional
1721 Must be `False`. Provided only for backwards compatibility. After
1722 v27 this argument will be removed entirely.
1723 missing : `list` of `str`, optional
1724 String dataset type names that were explicitly given (i.e. not
1725 regular expression patterns) but not found will be appended to this
1726 list, if it is provided.
1728 Returns
1729 -------
1730 dataset_types : `~collections.abc.Iterable` [ `DatasetType`]
1731 An `~collections.abc.Iterable` of `DatasetType` instances whose
1732 names match ``expression``.
1734 Raises
1735 ------
1736 lsst.daf.butler.registry.DatasetTypeExpressionError
1737 Raised when ``expression`` is invalid.
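Examples
--------
A sketch using an exact name and a regular expression; the dataset
type names are hypothetical:

>>> import re
>>> list(registry.queryDatasetTypes("calexp"))  # exact name
>>> list(registry.queryDatasetTypes(re.compile("deepCoadd.*")))  # pattern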
1738 """
1739 if components is not _DefaultMarker:
1740 if components is not False:
1741 raise DatasetTypeError(
1742 "Dataset component queries are no longer supported by Registry. Use "
1743 "DatasetType methods to obtain components from parent dataset types instead."
1744 )
1745 else:
1746 warnings.warn(
1747 "The components parameter is ignored. It will be removed after v27.",
1748 category=FutureWarning,
1749 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
1750 )
1751 wildcard = DatasetTypeWildcard.from_expression(expression)
1752 return self._managers.datasets.resolve_wildcard(wildcard, missing=missing)
1754 def queryCollections(
1755 self,
1756 expression: Any = ...,
1757 datasetType: DatasetType | None = None,
1758 collectionTypes: Iterable[CollectionType] | CollectionType = CollectionType.all(),
1759 flattenChains: bool = False,
1760 includeChains: bool | None = None,
1761 ) -> Sequence[str]:
1762 """Iterate over the collections whose names match an expression.
1764 Parameters
1765 ----------
1766 expression : collection expression, optional
1767 An expression that identifies the collections to return, such as
1768 a `str` (for full matches or partial matches via globs),
1769 `re.Pattern` (for partial matches), or iterable thereof. ``...``
1770 can be used to return all collections, and is the default.
1771 See :ref:`daf_butler_collection_expressions` for more information.
1772 datasetType : `DatasetType`, optional
1773 If provided, only yield collections that may contain datasets of
1774 this type. This is a conservative approximation in general; it may
1775 yield collections that do not have any such datasets.
1776 collectionTypes : `~collections.abc.Set` [`CollectionType`] or \
1777 `CollectionType`, optional
1778 If provided, only yield collections of these types.
1779 flattenChains : `bool`, optional
1780 If `True` (`False` is default), recursively yield the child
1781 collections of matching `~CollectionType.CHAINED` collections.
1782 includeChains : `bool`, optional
1783 If `True`, yield records for matching `~CollectionType.CHAINED`
1784 collections. Default is the opposite of ``flattenChains``: include
1785 either CHAINED collections or their children, but not both.
1787 Returns
1788 -------
1789 collections : `~collections.abc.Sequence` [ `str` ]
1790 The names of collections that match ``expression``.
1792 Raises
1793 ------
1794 lsst.daf.butler.registry.CollectionExpressionError
1795 Raised when ``expression`` is invalid.
1797 Notes
1798 -----
1799 The order in which collections are returned is unspecified, except that
1800 the children of a `~CollectionType.CHAINED` collection are guaranteed
1801 to be in the order in which they are searched. When multiple parent
1802 `~CollectionType.CHAINED` collections match the same criteria, the
1803 order in which their child lists appear is unspecified, and the lists of
1804 children may be incomplete if a child has multiple parents.
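Examples
--------
A sketch listing `~CollectionType.RUN` collections whose names match a
glob; the collection naming scheme is hypothetical:

>>> from lsst.daf.butler.registry import CollectionType
>>> registry.queryCollections("HSC/runs/*", collectionTypes=CollectionType.RUN)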
1805 """
1806 # Right now the datasetType argument is completely ignored, but that
1807 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
1808 # ticket will take care of that.
1809 try:
1810 wildcard = CollectionWildcard.from_expression(expression)
1811 except TypeError as exc:
1812 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
1813 collectionTypes = ensure_iterable(collectionTypes)
1814 return [
1815 record.name
1816 for record in self._managers.collections.resolve_wildcard(
1817 wildcard,
1818 collection_types=frozenset(collectionTypes),
1819 flatten_chains=flattenChains,
1820 include_chains=includeChains,
1821 )
1822 ]
1824 def _makeQueryBuilder(
1825 self,
1826 summary: queries.QuerySummary,
1827 doomed_by: Iterable[str] = (),
1828 ) -> queries.QueryBuilder:
1829 """Return a `QueryBuilder` instance capable of constructing and
1830 managing more complex queries than those obtainable via `Registry`
1831 interfaces.
1833 This is an advanced interface; downstream code should prefer
1834 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1835 are sufficient.
1837 Parameters
1838 ----------
1839 summary : `queries.QuerySummary`
1840 Object describing and categorizing the full set of dimensions that
1841 will be included in the query.
1842 doomed_by : `~collections.abc.Iterable` of `str`, optional
1843 A list of diagnostic messages that indicate why the query is going
1844 to yield no results and should not even be executed. If an empty
1845 container (default) the query will be executed unless other code
1846 determines that it is doomed.
1848 Returns
1849 -------
1850 builder : `queries.QueryBuilder`
1851 Object that can be used to construct and perform advanced queries.
1852 """
1853 doomed_by = list(doomed_by)
1854 backend = queries.SqlQueryBackend(self._db, self._managers, self.dimension_record_cache)
1855 context = backend.context()
1856 relation: Relation | None = None
1857 if doomed_by:
1858 relation = LeafRelation.make_doomed(context.sql_engine, set(), doomed_by)
1859 return queries.QueryBuilder(
1860 summary,
1861 backend=backend,
1862 context=context,
1863 relation=relation,
1864 )
1866 def _standardize_query_data_id_args(
1867 self, data_id: DataId | None, *, doomed_by: list[str], **kwargs: Any
1868 ) -> DataCoordinate:
1869 """Preprocess the data ID arguments passed to query* methods.
1871 Parameters
1872 ----------
1873 data_id : `DataId` or `None`
1874 Data ID that constrains the query results.
1875 doomed_by : `list` [ `str` ]
1876 List to append messages indicating why the query is doomed to
1877 yield no results.
1878 **kwargs
1879 Additional data ID key-value pairs, extending and overriding
1880 ``data_id``.
1882 Returns
1883 -------
1884 data_id : `DataCoordinate`
1885 Standardized data ID. Will be fully expanded unless expansion
1886 fails, in which case a message will be appended to ``doomed_by``
1887 on return.
1888 """
1889 try:
1890 return self.expandDataId(data_id, **kwargs)
1891 except DataIdValueError as err:
1892 doomed_by.append(str(err))
1893 return DataCoordinate.standardize(
1894 data_id, **kwargs, universe=self.dimensions, defaults=self.defaults.dataId
1895 )
1897 def _standardize_query_dataset_args(
1898 self,
1899 datasets: Any,
1900 collections: CollectionArgType | None,
1901 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
1902 *,
1903 doomed_by: list[str],
1904 ) -> tuple[list[DatasetType], CollectionWildcard | None]:
1905 """Preprocess dataset arguments passed to query* methods.
1907 Parameters
1908 ----------
1909 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
1910 Expression identifying dataset types. See `queryDatasetTypes` for
1911 details.
1912 collections : `str`, `re.Pattern`, or iterable of these
1913 Expression identifying collections to be searched. See
1914 `queryCollections` for details.
1915 mode : `str`, optional
1916 The way in which datasets are being used in this query; one of:
1918 - "find_first": this is a query for the first dataset in an
1919 ordered list of collections. Prohibits collection wildcards,
1920 but permits dataset type wildcards.
1922 - "find_all": this is a query for all datasets in all matched
1923 collections. Permits collection and dataset type wildcards.
1925 - "constrain": this is a query for something other than datasets,
1926 with results constrained by dataset existence. Permits
1927 collection wildcards and prohibits ``...`` as a dataset type
1928 wildcard.
1929 doomed_by : `list` [ `str` ]
1930 List to append messages indicating why the query is doomed to
1931 yield no results.
1933 Returns
1934 -------
1935 dataset_types : `list` [ `DatasetType` ]
1936 List of matched dataset types.
1937 collections : `CollectionWildcard`
1938 Processed collection expression.
1939 """
1940 dataset_types: list[DatasetType] = []
1941 collection_wildcard: CollectionWildcard | None = None
1942 if datasets is not None:
1943 if collections is None:
1944 if not self.defaults.collections:
1945 raise NoDefaultCollectionError("No collections, and no registry default collections.")
1946 collection_wildcard = CollectionWildcard.from_expression(self.defaults.collections)
1947 else:
1948 collection_wildcard = CollectionWildcard.from_expression(collections)
1949 if mode == "find_first" and collection_wildcard.patterns:
1950 raise TypeError(
1951 f"Collection pattern(s) {collection_wildcard.patterns} not allowed in this context."
1952 )
1953 missing: list[str] = []
1954 dataset_types = self._managers.datasets.resolve_wildcard(
1955 datasets, missing=missing, explicit_only=(mode == "constrain")
1956 )
1957 if missing and mode == "constrain":
1958 raise MissingDatasetTypeError(
1959 f"Dataset type(s) {missing} are not registered.",
1960 )
1961 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
1962 elif collections:
1963 # I think this check should actually be `collections is not None`,
1964 # but it looks like some CLI scripts use empty tuple as default.
1965 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1966 return dataset_types, collection_wildcard
1968 def queryDatasets(
1969 self,
1970 datasetType: Any,
1971 *,
1972 collections: CollectionArgType | None = None,
1973 dimensions: Iterable[Dimension | str] | None = None,
1974 dataId: DataId | None = None,
1975 where: str = "",
1976 findFirst: bool = False,
1977 components: bool | _Marker = _DefaultMarker,
1978 bind: Mapping[str, Any] | None = None,
1979 check: bool = True,
1980 **kwargs: Any,
1981 ) -> queries.DatasetQueryResults:
1982 """Query for and iterate over dataset references matching user-provided
1983 criteria.
1985 Parameters
1986 ----------
1987 datasetType : dataset type expression
1988 An expression that fully or partially identifies the dataset types
1989 to be queried. Allowed types include `DatasetType`, `str`,
1990 `re.Pattern`, and iterables thereof. The special value ``...`` can
1991 be used to query all dataset types. See
1992 :ref:`daf_butler_dataset_type_expressions` for more information.
1993 collections : collection expression, optional
1994 An expression that identifies the collections to search, such as a
1995 `str` (for full matches or partial matches via globs), `re.Pattern`
1996 (for partial matches), or iterable thereof. ``...`` can be used to
1997 search all collections (actually just all `~CollectionType.RUN`
1998 collections, because this will still find all datasets).
1999 If not provided, ``self.defaults.collections`` is used. See
2000 :ref:`daf_butler_collection_expressions` for more information.
2001 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
2002 Dimensions to include in the query (in addition to those used
2003 to identify the queried dataset type(s)), either to constrain
2004 the resulting datasets to those for which a matching dimension
2005 exists, or to relate the dataset type's dimensions to dimensions
2006 referenced by the ``dataId`` or ``where`` arguments.
2007 dataId : `dict` or `DataCoordinate`, optional
2008 A data ID whose key-value pairs are used as equality constraints
2009 in the query.
2010 where : `str`, optional
2011 A string expression similar to a SQL WHERE clause. May involve
2012 any column of a dimension table or (as a shortcut for the primary
2013 key column of a dimension table) dimension name. See
2014 :ref:`daf_butler_dimension_expressions` for more information.
2015 findFirst : `bool`, optional
2016 If `True` (`False` is default), for each result data ID, only
2017 yield one `DatasetRef` of each `DatasetType`, from the first
2018 collection in which a dataset of that dataset type appears
2019 (according to the order of ``collections`` passed in). If `True`,
2020 ``collections`` must not contain regular expressions and may not
2021 be ``...``.
2022 components : `bool`, optional
2023 Must be `False`. Provided only for backwards compatibility. After
2024 v27 this argument will be removed entirely.
2025 bind : `~collections.abc.Mapping`, optional
2026 Mapping containing literal values that should be injected into the
2027 ``where`` expression, keyed by the identifiers they replace.
2028 Values of collection type can be expanded in some cases; see
2029 :ref:`daf_butler_dimension_expressions_identifiers` for more
2030 information.
2031 check : `bool`, optional
2032 If `True` (default) check the query for consistency before
2033 executing it. This may reject some valid queries that resemble
2034 common mistakes (e.g. queries for visits without specifying an
2035 instrument).
2036 **kwargs
2037 Additional keyword arguments are forwarded to
2038 `DataCoordinate.standardize` when processing the ``dataId``
2039 argument (and may be used to provide a constraining data ID even
2040 when the ``dataId`` argument is `None`).
2042 Returns
2043 -------
2044 refs : `.queries.DatasetQueryResults`
2045 Dataset references matching the given query criteria. Nested data
2046 IDs are guaranteed to include values for all implied dimensions
2047 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
2048 include dimension records (`DataCoordinate.hasRecords` will be
2049 `False`) unless `~.queries.DatasetQueryResults.expanded` is
2050 called on the result object (which returns a new one).
2052 Raises
2053 ------
2054 lsst.daf.butler.registry.DatasetTypeExpressionError
2055 Raised when ``datasetType`` expression is invalid.
2056 TypeError
2057 Raised when the arguments are incompatible, such as when a
2058 collection wildcard is passed when ``findFirst`` is `True`, or
2059 when ``collections`` is `None` and ``self.defaults.collections`` is
2060 also `None`.
2061 lsst.daf.butler.registry.DataIdError
2062 Raised when ``dataId`` or keyword arguments specify unknown
2063 dimensions or values, or when they contain inconsistent values.
2064 lsst.daf.butler.registry.UserExpressionError
2065 Raised when ``where`` expression is invalid.
2067 Notes
2068 -----
2069 When multiple dataset types are queried in a single call, the
2070 results of this operation are equivalent to querying for each dataset
2071 type separately in turn, and no information about the relationships
2072 between datasets of different types is included. In contexts where
2073 that kind of information is important, the recommended pattern is to
2074 use `queryDataIds` to first obtain data IDs (possibly with the
2075 desired dataset types and collections passed as constraints to the
2076 query), and then use multiple (generally much simpler) calls to
2077 `queryDatasets` with the returned data IDs passed as constraints.
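Examples
--------
A sketch of a find-first query for a single dataset type; the dataset
type name, collection name, and data ID values are hypothetical:

>>> refs = registry.queryDatasets(
...     "calexp",
...     collections="HSC/runs/my-reprocessing",
...     where="instrument = 'HSC' AND visit = 903334",
...     findFirst=True,
... )
>>> for ref in refs:
...     print(ref.dataId)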
2078 """
2079 if components is not _DefaultMarker:
2080 if components is not False:
2081 raise DatasetTypeError(
2082 "Dataset component queries are no longer supported by Registry. Use "
2083 "DatasetType methods to obtain components from parent dataset types instead."
2084 )
2085 else:
2086 warnings.warn(
2087 "The components parameter is ignored. It will be removed after v27.",
2088 category=FutureWarning,
2089 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
2090 )
2091 doomed_by: list[str] = []
2092 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2093 resolved_dataset_types, collection_wildcard = self._standardize_query_dataset_args(
2094 datasetType,
2095 collections,
2096 mode="find_first" if findFirst else "find_all",
2097 doomed_by=doomed_by,
2098 )
2099 if collection_wildcard is not None and collection_wildcard.empty():
2100 doomed_by.append("No datasets can be found because collection list is empty.")
2101 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2102 parent_results: list[queries.ParentDatasetQueryResults] = []
2103 for resolved_dataset_type in resolved_dataset_types:
2104 # The full set of dimensions in the query is the combination of
2105 # those needed for the DatasetType and those explicitly requested,
2106 # if any.
2107 dimension_names = set(resolved_dataset_type.dimensions.names)
2108 if dimensions is not None:
2109 dimension_names.update(self.dimensions.conform(dimensions).names)
2110 # Construct the summary structure needed to construct a
2111 # QueryBuilder.
2112 summary = queries.QuerySummary(
2113 requested=self.dimensions.conform(dimension_names),
2114 column_types=self._managers.column_types,
2115 data_id=data_id,
2116 expression=where,
2117 bind=bind,
2118 defaults=self.defaults.dataId,
2119 check=check,
2120 datasets=[resolved_dataset_type],
2121 )
2122 builder = self._makeQueryBuilder(summary)
2123 # Add the dataset subquery to the query, telling the QueryBuilder
2124 # to include the rank of the selected collection in the results
2125 # only if we need to findFirst. Note that if any of the
2126 # collections are actually wildcard expressions, and
2127 # findFirst=True, this will raise TypeError for us.
2128 builder.joinDataset(
2129 resolved_dataset_type, collection_wildcard, isResult=True, findFirst=findFirst
2130 )
2131 query = builder.finish()
2132 parent_results.append(
2133 queries.ParentDatasetQueryResults(query, resolved_dataset_type, components=[None])
2134 )
2135 if not parent_results:
2136 doomed_by.extend(
2137 f"No registered dataset type matching {t!r} found, so no matching datasets can "
2138 "exist in any collection."
2139 for t in ensure_iterable(datasetType)
2140 )
2141 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
2142 elif len(parent_results) == 1:
2143 return parent_results[0]
2144 else:
2145 return queries.ChainedDatasetQueryResults(parent_results)
2147 def queryDataIds(
2148 self,
2149 # TODO: Drop Dimension support on DM-41326.
2150 dimensions: DimensionGroup | Iterable[Dimension | str] | Dimension | str,
2151 *,
2152 dataId: DataId | None = None,
2153 datasets: Any = None,
2154 collections: CollectionArgType | None = None,
2155 where: str = "",
2156 components: bool | _Marker = _DefaultMarker,
2157 bind: Mapping[str, Any] | None = None,
2158 check: bool = True,
2159 **kwargs: Any,
2160 ) -> queries.DataCoordinateQueryResults:
2161 """Query for data IDs matching user-provided criteria.
2163 Parameters
2164 ----------
2165 dimensions : `DimensionGroup`, `Dimension`, or `str`, or \
2166 `~collections.abc.Iterable` [ `Dimension` or `str` ]
2167 The dimensions of the data IDs to yield, as either `Dimension`
2168 instances or `str`. Will be automatically expanded to a complete
2169 `DimensionGroup`. Support for `Dimension` instances is deprecated
2170 and will not be supported after v27.
2171 dataId : `dict` or `DataCoordinate`, optional
2172 A data ID whose key-value pairs are used as equality constraints
2173 in the query.
2174 datasets : dataset type expression, optional
2175 An expression that fully or partially identifies dataset types
2176 that should constrain the yielded data IDs. For example, including
2177 "raw" here would constrain the yielded ``instrument``,
2178 ``exposure``, ``detector``, and ``physical_filter`` values to only
2179 those for which at least one "raw" dataset exists in
2180 ``collections``. Allowed types include `DatasetType`, `str`,
2181 and iterables thereof. Regular expression objects (i.e.
2182 `re.Pattern`) are deprecated and will be removed after the v26
2183 release. See :ref:`daf_butler_dataset_type_expressions` for more
2184 information.
2185 collections : collection expression, optional
2186 An expression that identifies the collections to search for
2187 datasets, such as a `str` (for full matches or partial matches
2188 via globs), `re.Pattern` (for partial matches), or iterable
2189 thereof. ``...`` can be used to search all collections (actually
2190 just all `~CollectionType.RUN` collections, because this will
2191 still find all datasets). If not provided,
2192 ``self.defaults.collections`` is used. Ignored unless ``datasets``
2193 is also passed. See :ref:`daf_butler_collection_expressions` for
2194 more information.
2195 where : `str`, optional
2196 A string expression similar to a SQL WHERE clause. May involve
2197 any column of a dimension table or (as a shortcut for the primary
2198 key column of a dimension table) dimension name. See
2199 :ref:`daf_butler_dimension_expressions` for more information.
2200 components : `bool`, optional
2201 Must be `False`. Provided only for backwards compatibility. After
2202 v27 this argument will be removed entirely.
2203 bind : `~collections.abc.Mapping`, optional
2204 Mapping containing literal values that should be injected into the
2205 ``where`` expression, keyed by the identifiers they replace.
2206 Values of collection type can be expanded in some cases; see
2207 :ref:`daf_butler_dimension_expressions_identifiers` for more
2208 information.
2209 check : `bool`, optional
2210 If `True` (default) check the query for consistency before
2211 executing it. This may reject some valid queries that resemble
2212 common mistakes (e.g. queries for visits without specifying an
2213 instrument).
2214 **kwargs
2215 Additional keyword arguments are forwarded to
2216 `DataCoordinate.standardize` when processing the ``dataId``
2217 argument (and may be used to provide a constraining data ID even
2218 when the ``dataId`` argument is `None`).
2220 Returns
2221 -------
2222 dataIds : `.queries.DataCoordinateQueryResults`
2223 Data IDs matching the given query parameters. These are guaranteed
2224 to identify all dimensions (`DataCoordinate.hasFull` returns
2225 `True`), but will not contain `DimensionRecord` objects
2226 (`DataCoordinate.hasRecords` returns `False`). Call
2227 `~.queries.DataCoordinateQueryResults.expanded` on the
2228 returned object to fetch those (and consider using
2229 `~.queries.DataCoordinateQueryResults.materialize` on the
2230 returned object first if the expected number of rows is very
2231 large). See documentation for those methods for additional
2232 information.
2234 Raises
2235 ------
2236 lsst.daf.butler.registry.NoDefaultCollectionError
2237 Raised if ``collections`` is `None` and
2238 ``self.defaults.collections`` is `None`.
2239 lsst.daf.butler.registry.CollectionExpressionError
2240 Raised when ``collections`` expression is invalid.
2241 lsst.daf.butler.registry.DataIdError
2242 Raised when ``dataId`` or keyword arguments specify unknown
2243 dimensions or values, or when they contain inconsistent values.
2244 lsst.daf.butler.registry.DatasetTypeExpressionError
2245 Raised when the ``datasets`` expression is invalid.
2246 lsst.daf.butler.registry.UserExpressionError
2247 Raised when ``where`` expression is invalid.
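Examples
--------
A sketch constraining exposure+detector data IDs by the existence of a
hypothetical "raw" dataset in a hypothetical collection:

>>> data_ids = registry.queryDataIds(
...     ["exposure", "detector"],
...     datasets="raw",
...     collections="HSC/raw/all",
...     instrument="HSC",
... )
>>> for data_id in data_ids:
...     print(data_id["exposure"], data_id["detector"])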
2248 """
2249 if components is not _DefaultMarker:
2250 if components is not False:
2251 raise DatasetTypeError(
2252 "Dataset component queries are no longer supported by Registry. Use "
2253 "DatasetType methods to obtain components from parent dataset types instead."
2254 )
2255 else:
2256 warnings.warn(
2257 "The components parameter is ignored. It will be removed after v27.",
2258 category=FutureWarning,
2259 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
2260 )
2261 requested_dimensions = self.dimensions.conform(dimensions)
2262 doomed_by: list[str] = []
2263 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2264 resolved_dataset_types, collection_wildcard = self._standardize_query_dataset_args(
2265 datasets, collections, doomed_by=doomed_by
2266 )
2267 if collection_wildcard is not None and collection_wildcard.empty():
2268 doomed_by.append("No data coordinates can be found because collection list is empty.")
2269 summary = queries.QuerySummary(
2270 requested=requested_dimensions,
2271 column_types=self._managers.column_types,
2272 data_id=data_id,
2273 expression=where,
2274 bind=bind,
2275 defaults=self.defaults.dataId,
2276 check=check,
2277 datasets=resolved_dataset_types,
2278 )
2279 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2280 for datasetType in resolved_dataset_types:
2281 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2282 query = builder.finish()
2284 return queries.DataCoordinateQueryResults(query)
2286 def queryDimensionRecords(
2287 self,
2288 element: DimensionElement | str,
2289 *,
2290 dataId: DataId | None = None,
2291 datasets: Any = None,
2292 collections: CollectionArgType | None = None,
2293 where: str = "",
2294 components: bool | _Marker = _DefaultMarker,
2295 bind: Mapping[str, Any] | None = None,
2296 check: bool = True,
2297 **kwargs: Any,
2298 ) -> queries.DimensionRecordQueryResults:
2299 """Query for dimension information matching user-provided criteria.
2301 Parameters
2302 ----------
2303 element : `DimensionElement` or `str`
2304 The dimension element to obtain records for.
2305 dataId : `dict` or `DataCoordinate`, optional
2306 A data ID whose key-value pairs are used as equality constraints
2307 in the query.
2308 datasets : dataset type expression, optional
2309 An expression that fully or partially identifies dataset types
2310 that should constrain the yielded records. See `queryDataIds` and
2311 :ref:`daf_butler_dataset_type_expressions` for more information.
2312 collections : collection expression, optional
2313 An expression that identifies the collections to search for
2314 datasets, such as a `str` (for full matches or partial matches
2315 via globs), `re.Pattern` (for partial matches), or iterable
2316 thereof. ``...`` can be used to search all collections (actually
2317 just all `~CollectionType.RUN` collections, because this will
2318 still find all datasets). If not provided,
2319 ``self.defaults.collections`` is used. Ignored unless ``datasets``
2320 is also passed. See :ref:`daf_butler_collection_expressions` for
2321 more information.
2322 where : `str`, optional
2323 A string expression similar to a SQL WHERE clause. See
2324 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
2325 information.
2326 components : `bool`, optional
2330 Must be `False`. Provided only for backwards compatibility. After
2331 v27 this argument will be removed entirely.
2332 bind : `~collections.abc.Mapping`, optional
2333 Mapping containing literal values that should be injected into the
2334 ``where`` expression, keyed by the identifiers they replace.
2335 Values of collection type can be expanded in some cases; see
2336 :ref:`daf_butler_dimension_expressions_identifiers` for more
2337 information.
2338 check : `bool`, optional
2339 If `True` (default) check the query for consistency before
2340 executing it. This may reject some valid queries that resemble
2341 common mistakes (e.g. queries for visits without specifying an
2342 instrument).
2343 **kwargs
2344 Additional keyword arguments are forwarded to
2345 `DataCoordinate.standardize` when processing the ``dataId``
2346 argument (and may be used to provide a constraining data ID even
2347 when the ``dataId`` argument is `None`).
2349 Returns
2350 -------
2351 records : `.queries.DimensionRecordQueryResults`
2352 Dimension records matching the given query parameters.
2354 Raises
2355 ------
2356 lsst.daf.butler.registry.NoDefaultCollectionError
2357 Raised if ``collections`` is `None` and
2358 ``self.defaults.collections`` is `None`.
2359 lsst.daf.butler.registry.CollectionExpressionError
2360 Raised when ``collections`` expression is invalid.
2361 lsst.daf.butler.registry.DataIdError
2362 Raised when ``dataId`` or keyword arguments specify unknown
2363 dimensions or values, or when they contain inconsistent values.
2364 lsst.daf.butler.registry.DatasetTypeExpressionError
2365 Raised when the ``datasets`` expression is invalid.
2366 lsst.daf.butler.registry.UserExpressionError
2367 Raised when ``where`` expression is invalid.
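Examples
--------
A sketch retrieving detector records for a hypothetical ``HSC``
instrument; the ``detector.purpose`` column is assumed to exist in the
dimension universe being used:

>>> records = registry.queryDimensionRecords(
...     "detector",
...     instrument="HSC",
...     where="detector.purpose = 'SCIENCE'",
... )
>>> for record in records:
...     print(record.id, record.full_name)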
2368 """
2369 if components is not _DefaultMarker:
2370 if components is not False:
2371 raise DatasetTypeError(
2372 "Dataset component queries are no longer supported by Registry. Use "
2373 "DatasetType methods to obtain components from parent dataset types instead."
2374 )
2375 else:
2376 warnings.warn(
2377 "The components parameter is ignored. It will be removed after v27.",
2378 category=FutureWarning,
2379 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
2380 )
2381 if not isinstance(element, DimensionElement):
2382 try:
2383 element = self.dimensions[element]
2384 except KeyError as e:
2385 raise DimensionNameError(
2386 f"No such dimension '{element}', available dimensions: " + str(self.dimensions.elements)
2387 ) from e
2388 doomed_by: list[str] = []
2389 data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
2390 resolved_dataset_types, collection_wildcard = self._standardize_query_dataset_args(
2391 datasets, collections, doomed_by=doomed_by
2392 )
2393 if collection_wildcard is not None and collection_wildcard.empty():
2394 doomed_by.append("No dimension records can be found because collection list is empty.")
2395 summary = queries.QuerySummary(
2396 requested=element.minimal_group,
2397 column_types=self._managers.column_types,
2398 data_id=data_id,
2399 expression=where,
2400 bind=bind,
2401 defaults=self.defaults.dataId,
2402 check=check,
2403 datasets=resolved_dataset_types,
2404 )
2405 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
2406 for datasetType in resolved_dataset_types:
2407 builder.joinDataset(datasetType, collection_wildcard, isResult=False)
2408 query = builder.finish().with_record_columns(element.name)
2409 return queries.DatabaseDimensionRecordQueryResults(query, element)
2411 def queryDatasetAssociations(
2412 self,
2413 datasetType: str | DatasetType,
2414 collections: CollectionArgType | None = ...,
2415 *,
2416 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
2417 flattenChains: bool = False,
2418 ) -> Iterator[DatasetAssociation]:
2419 """Iterate over dataset-collection combinations where the dataset is in
2420 the collection.
2422 This method is a temporary placeholder for better support for
2423 association results in `queryDatasets`. It will probably be
2424 removed in the future, and should be avoided in production code
2425 whenever possible.
2427 Parameters
2428 ----------
2429 datasetType : `DatasetType` or `str`
2430 A dataset type object or the name of one.
2431 collections : collection expression, optional
2432 An expression that identifies the collections to search for
2433 datasets, such as a `str` (for full matches or partial matches
2434 via globs), `re.Pattern` (for partial matches), or iterable
2435 thereof. ``...`` can be used to search all collections (actually
2436 just all `~CollectionType.RUN` collections, because this will still
2437 find all datasets). If not provided, ``self.defaults.collections``
2438 is used. See :ref:`daf_butler_collection_expressions` for more
2439 information.
2440 collectionTypes : `~collections.abc.Set` [ `CollectionType` ], optional
2441 If provided, only yield associations from collections of these
2442 types.
2443 flattenChains : `bool`, optional
2444 If `True`, search in the children of `~CollectionType.CHAINED`
2445 collections. If `False`, ``CHAINED`` collections are ignored.
2447 Yields
2448 ------
2449 association : `.DatasetAssociation`
2450 Object representing the relationship between a single dataset and
2451 a single collection.
2453 Raises
2454 ------
2455 lsst.daf.butler.registry.NoDefaultCollectionError
2456 Raised if ``collections`` is `None` and
2457 ``self.defaults.collections`` is `None`.
2458 lsst.daf.butler.registry.CollectionExpressionError
2459 Raised when ``collections`` expression is invalid.
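Examples
--------
A sketch listing the validity ranges of a hypothetical "bias" dataset
type in `~CollectionType.CALIBRATION` collections:

>>> from lsst.daf.butler.registry import CollectionType
>>> for assoc in registry.queryDatasetAssociations(
...     "bias", collectionTypes={CollectionType.CALIBRATION}
... ):
...     print(assoc.collection, assoc.timespan)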
2460 """
2461 if collections is None:
2462 if not self.defaults.collections:
2463 raise NoDefaultCollectionError(
2464 "No collections provided to queryDatasetAssociations, "
2465 "and no defaults from registry construction."
2466 )
2467 collections = self.defaults.collections
2468 collection_wildcard = CollectionWildcard.from_expression(collections)
2469 backend = queries.SqlQueryBackend(self._db, self._managers, self.dimension_record_cache)
2470 parent_dataset_type = backend.resolve_single_dataset_type_wildcard(datasetType)
2471 timespan_tag = DatasetColumnTag(parent_dataset_type.name, "timespan")
2472 collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
2473 for parent_collection_record in backend.resolve_collection_wildcard(
2474 collection_wildcard,
2475 collection_types=frozenset(collectionTypes),
2476 flatten_chains=flattenChains,
2477 ):
2478 # Resolve this possibly-chained collection into a list of
2479 # non-CHAINED collections that actually hold datasets of this
2480 # type.
2481 candidate_collection_records = backend.resolve_dataset_collections(
2482 parent_dataset_type,
2483 CollectionWildcard.from_names([parent_collection_record.name]),
2484 allow_calibration_collections=True,
2485 governor_constraints={},
2486 )
2487 if not candidate_collection_records:
2488 continue
2489 with backend.context() as context:
2490 relation = backend.make_dataset_query_relation(
2491 parent_dataset_type,
2492 candidate_collection_records,
2493 columns={"dataset_id", "run", "timespan", "collection"},
2494 context=context,
2495 )
2496 reader = queries.DatasetRefReader(
2497 parent_dataset_type,
2498 translate_collection=lambda k: self._managers.collections[k].name,
2499 full=False,
2500 )
2501 for row in context.fetch_iterable(relation):
2502 ref = reader.read(row)
2503 collection_record = self._managers.collections[row[collection_tag]]
2504 if collection_record.type is CollectionType.CALIBRATION:
2505 timespan = row[timespan_tag]
2506 else:
2507 # For backwards compatibility and (possibly?) user
2508 # convenience we continue to define the timespan of a
2509 # DatasetAssociation row for a non-CALIBRATION
2510 # collection to be None rather than a fully unbounded
2511 # timespan.
2512 timespan = None
2513 yield DatasetAssociation(ref=ref, collection=collection_record.name, timespan=timespan)
2515 def get_datastore_records(self, ref: DatasetRef) -> DatasetRef:
2516 """Retrieve datastore records for given ref.
2518 Parameters
2519 ----------
2520 ref : `DatasetRef`
2521 Dataset reference for which to retrieve its corresponding datastore
2522 records.
2524 Returns
2525 -------
2526 updated_ref : `DatasetRef`
2527 Dataset reference with filled datastore records.
2529 Notes
2530 -----
2531 If this method is called with a dataset ref that is not known to the
2532 registry, a reference with an empty set of records is returned.
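Examples
--------
A sketch, assuming ``ref`` is a resolved `DatasetRef` previously
obtained from this registry:

>>> updated_ref = registry.get_datastore_records(ref)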
2533 """
2534 datastore_records: dict[str, list[StoredDatastoreItemInfo]] = {}
2535 for opaque, record_class in self._datastore_record_classes.items():
2536 records = self.fetchOpaqueData(opaque, dataset_id=ref.id)
2537 datastore_records[opaque] = [record_class.from_record(record) for record in records]
2538 return ref.replace(datastore_records=datastore_records)
2540 def store_datastore_records(self, refs: Mapping[str, DatasetRef]) -> None:
2541 """Store datastore records for given refs.
2543 Parameters
2544 ----------
2545 refs : `~collections.abc.Mapping` [`str`, `DatasetRef`]
2546 Mapping of datastore name to a dataset reference stored in that
2547 datastore; each reference must include datastore records.
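Examples
--------
A sketch, assuming ``ref`` is a `DatasetRef` whose datastore records
have already been filled in (e.g. via `get_datastore_records`) and
that the datastore name shown is registered; both names are
illustrative only:

>>> registry.store_datastore_records({"FileDatastore@<butlerRoot>": ref})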
2548 """
2549 for datastore_name, ref in refs.items():
2550 # Store ref IDs in the bridge table.
2551 bridge = self._managers.datastores.register(datastore_name)
2552 bridge.insert([ref])
2554 # store records in opaque tables
2555 assert ref._datastore_records is not None, "Dataset ref must have datastore records"
2556 for table_name, records in ref._datastore_records.items():
2557 opaque_table = self._managers.opaque.get(table_name)
2558 assert opaque_table is not None, f"Unexpected opaque table name {table_name}"
2559 opaque_table.insert(*(record.to_record(dataset_id=ref.id) for record in records))
2561 def make_datastore_tables(self, tables: Mapping[str, DatastoreOpaqueTable]) -> None:
2562 """Create opaque tables used by datastores.
2564 Parameters
2565 ----------
2566 tables : `~collections.abc.Mapping`
2567 Maps opaque table name to its definition.
2569 Notes
2570 -----
2571 This method should disappear in the future when opaque table
2572 definitions are provided during `Registry` construction.
2573 """
2574 datastore_record_classes = {}
2575 for table_name, table_def in tables.items():
2576 datastore_record_classes[table_name] = table_def.record_class
2577 try:
2578 self._managers.opaque.register(table_name, table_def.table_spec)
2579 except ReadOnlyDatabaseError:
2580 # If the database is read only and we just tried and failed to
2581 # create a table, it means someone is trying to create a
2582 # read-only butler client for an empty repo. That should be
2583 # okay, as long as they then try to get any datasets before
2584 # some other client creates the table. Chances are they're
2585 # just validating configuration.
2586 pass
2587 self._datastore_record_classes = datastore_record_classes
2589 @property
2590 def obsCoreTableManager(self) -> ObsCoreTableManager | None:
2591 """The ObsCore manager instance for this registry
2592 (`~.interfaces.ObsCoreTableManager`
2593 or `None`).
2595 The ObsCore manager may not be implemented for all registry backends,
2596 and may not be enabled for every repository.
2597 """
2598 return self._managers.obscore
2600 storageClasses: StorageClassFactory
2601 """All storage classes known to the registry (`StorageClassFactory`).
2602 """
2604 _defaults: RegistryDefaults
2605 """Default collections used for registry queries (`RegistryDefaults`)."""