Coverage for python/lsst/daf/butler/registries/sql.py: 14% (447 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("SqlRegistry",)

import contextlib
import logging
import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
)

import sqlalchemy
from lsst.resources import ResourcePathExpression
from lsst.utils.iteration import ensure_iterable

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
    ddl,
)
from ..core.utils import transactional
from ..registry import (
    ArgumentError,
    CollectionExpressionError,
    CollectionSummary,
    CollectionType,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    DimensionNameError,
    InconsistentDataIdError,
    NoDefaultCollectionError,
    OrphanedRecordError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(
        cls,
        config: Optional[Union[RegistryConfig, str]] = None,
        dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
        butlerRoot: Optional[ResourcePathExpression] = None,
    ) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes database contents; the database must be empty
        prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, the default configuration will
            be loaded from ``registry.yaml``.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, the default configuration
            will be loaded from ``dimensions.yaml``.
        butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)
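
    # Example (a minimal sketch): creating a brand-new registry backed by an
    # in-memory SQLite database. The SQLite URI is illustrative only; the
    # "db" key is the usual RegistryConfig connection-string setting.
    #
    #     config = RegistryConfig()
    #     config["db"] = "sqlite:///:memory:"
    #     registry = SqlRegistry.createFromConfig(config)
    #     assert registry.isWriteable()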

    @classmethod
    def fromConfig(
        cls,
        config: Union[ButlerConfig, RegistryConfig, Config, str],
        butlerRoot: Optional[ResourcePathExpression] = None,
        writeable: bool = True,
        defaults: Optional[RegistryDefaults] = None,
    ) -> Registry:
        """Create `Registry` subclass instance from ``config``.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `lsst.resources.ResourcePathExpression`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the
            database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString),
            origin=config.get("origin", 0),
            namespace=config.get("namespace"),
            writeable=writeable,
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        with database.session():
            managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)
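
    # Example (sketch): opening an existing repository with a read-only
    # connection, reusing the configuration object from the example above.
    #
    #     registry = SqlRegistry.fromConfig(config, writeable=False)
    #     print(registry.dimensions)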

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults
        # In the future DatasetIdFactory may become configurable and this
        # instance will need to be shared with datasets manager.
        self.datasetIdFactory = DatasetIdFactory()

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        with self._db.transaction():
            self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset the SQLAlchemy connection pool for the `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary one has to
        make sure that there are no currently active connections (no session
        or transaction is in progress) and that the connection pool is reset
        using this method. This method should be called by the child process
        immediately after the fork.
        """
        self._db._engine.dispose()
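
    # Sketch of the fork-based multiprocessing pattern described above; the
    # worker function and process setup are illustrative only.
    #
    #     import multiprocessing
    #
    #     def worker(registry: SqlRegistry) -> None:
    #         registry.resetConnectionPool()
    #         ...  # the child process may now use the registry safely
    #
    #     ctx = multiprocessing.get_context("fork")
    #     ctx.Process(target=worker, args=(registry,)).start()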

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved via
        `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)
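
    # Example (sketch) of the opaque-table API above. The table name, column
    # layout, and row contents are invented for illustration; real opaque
    # tables are normally defined by a Datastore implementation, and the
    # FieldSpec arguments shown are assumptions about the ddl helpers.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("example_datastore_records", spec)
    #     registry.insertOpaqueData("example_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))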

    def registerCollection(
        self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
    ) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> tuple[str, ...]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionWildcard.from_expression(children).require_ordered()
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)
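
    # Sketch of the collection/chain workflow built from the methods above;
    # the collection names are placeholders.
    #
    #     registry.registerRun("example/run1")
    #     registry.registerRun("example/run2")
    #     registry.registerCollection("example/chain", CollectionType.CHAINED)
    #     registry.setCollectionChain("example/chain", ["example/run1", "example/run2"])
    #     registry.getCollectionChain("example/chain")  # -> ("example/run1", "example/run2")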

    def getCollectionParentChains(self, collection: str) -> Set[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return {
            record.name
            for record in self._managers.collections.getParentChains(
                self._managers.collections.find(collection).key
            )
        }

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        parent_name, component = DatasetType.splitDatasetTypeName(name)
        storage = self._managers.datasets[parent_name]
        if component is None:
            return storage.datasetType
        else:
            return storage.datasetType.makeComponentDatasetType(component)

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(
        self,
        datasetType: Union[DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        timespan: Optional[Timespan] = None,
        **kwargs: Any,
    ) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        storage_class: str | None = None
        if isinstance(datasetType, DatasetType):
            parent_name, component = datasetType.nameAndComponent()
            if component is None:
                storage_class = datasetType.storageClass_name
        else:
            parent_name, component = DatasetType.splitDatasetTypeName(datasetType)
        storage = self._managers.datasets[parent_name]
        dataId = DataCoordinate.standardize(
            dataId,
            graph=storage.datasetType.dimensions,
            universe=self.dimensions,
            defaults=self.defaults.dataId,
            **kwargs,
        )
        if collections is None:
            if not self.defaults.collections:
                raise NoDefaultCollectionError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        collections = CollectionWildcard.from_expression(collections)
        collections.require_ordered()
        for collectionRecord in self._managers.collections.resolve_wildcard(collections):
            if collectionRecord.type is CollectionType.CALIBRATION and (
                not storage.datasetType.isCalibration() or timespan is None
            ):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan, storage_class=storage_class)
            if result is not None:
                if component is not None:
                    return result.makeComponentRef(component)
                return result

        return None

    @transactional
    def insertDatasets(
        self,
        datasetType: Union[DatasetType, str],
        dataIds: Iterable[DataId],
        run: Optional[str] = None,
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise NoDefaultCollectionError(
                    "No run provided to insertDatasets, and no default from registry construction."
                )
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise CollectionTypeError(
                f"Given collection is of type {runRecord.type.name}; RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [
                self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDataIds = [
                DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
            ]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
            if self._managers.obscore:
                self._managers.obscore.add_datasets(refs)
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    @transactional
    def _importDatasets(
        self,
        datasets: Iterable[DatasetRef],
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise NoDefaultCollectionError(
                    "No run provided to _importDatasets, and no default from registry construction."
                )
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise CollectionTypeError(
                f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                " RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
            if self._managers.obscore:
                self._managers.obscore.add_datasets(refs)
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError(
                    "One or more datasets is still present in one or more Datastores."
                ) from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise CollectionTypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
                if self._managers.obscore:
                    # If a TAGGED collection is being monitored by ObsCore
                    # manager then we may need to save the dataset.
                    self._managers.obscore.associate(refsForType, collectionRecord)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise CollectionTypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)
            if self._managers.obscore:
                self._managers.obscore.disassociate(refsForType, collectionRecord)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(
        self,
        collection: str,
        datasetType: Union[str, DatasetType],
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataId]] = None,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [
                DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
            ]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(
        self,
        dataId: Optional[DataId] = None,
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
        withDefaults: bool = True,
        **kwargs: Any,
    ) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        try:
            standardized = DataCoordinate.standardize(
                dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
            )
        except KeyError as exc:
            # This means either kwargs have some odd name or required
            # dimension is missing.
            raise DimensionNameError(str(exc)) from exc
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise DimensionNameError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise DataIdValueError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, ",
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related.",
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
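
    # Example (sketch): expanding a minimal data ID so that implied dimension
    # values and dimension records become available. The instrument and visit
    # values are placeholders for whatever exists in the repository.
    #
    #     data_id = registry.expandDataId(instrument="ExampleCam", visit=42)
    #     data_id.hasRecords()      # True
    #     data_id.records["visit"]  # the expanded `visit` dimension record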

    def insertDimensionData(
        self,
        element: Union[DimensionElement, str],
        *data: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        replace: bool = False,
        skip_existing: bool = False,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [
                row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
            ]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace, skip_existing=skip_existing)

    def syncDimensionData(
        self,
        element: Union[DimensionElement, str],
        row: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        update: bool = False,
    ) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)
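
    # Sketch of inserting and syncing dimension rows. The record contents are
    # illustrative; the full set of required fields for an `instrument` record
    # is defined by the dimension universe configuration.
    #
    #     registry.insertDimensionData("instrument", {"name": "ExampleCam"})
    #     registry.syncDimensionData("instrument", {"name": "ExampleCam"})  # safe to repeat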

    def queryDatasetTypes(
        self,
        expression: Any = ...,
        *,
        components: Optional[bool] = None,
        missing: Optional[List[str]] = None,
    ) -> Iterable[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = DatasetTypeWildcard.from_expression(expression)
        composition_dict = self._managers.datasets.resolve_wildcard(
            wildcard,
            components=components,
            missing=missing,
        )
        result: list[DatasetType] = []
        for parent_dataset_type, components_for_parent in composition_dict.items():
            result.extend(
                parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
                for c in components_for_parent
            )
        return result

    def queryCollections(
        self,
        expression: Any = ...,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
        includeChains: Optional[bool] = None,
    ) -> Sequence[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetTypes argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        try:
            wildcard = CollectionWildcard.from_expression(expression)
        except TypeError as exc:
            raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
        collectionTypes = ensure_iterable(collectionTypes)
        return [
            record.name
            for record in self._managers.collections.resolve_wildcard(
                wildcard,
                collection_types=frozenset(collectionTypes),
                flatten_chains=flattenChains,
                include_chains=includeChains,
            )
        ]

    def _makeQueryBuilder(
        self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
    ) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            backend=queries.SqlQueryBackend(self._db, self._managers),
            doomed_by=doomed_by,
        )

    def _standardize_query_dataset_args(
        self,
        datasets: Any,
        collections: Any,
        components: bool | None,
        mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
        *,
        doomed_by: list[str],
    ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
        """Preprocess dataset arguments passed to query* methods.

        Parameters
        ----------
        datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
            Expression identifying dataset types. See `queryDatasetTypes` for
            details.
        collections : `str`, `re.Pattern`, or iterable of these
            Expression identifying collections to be searched. See
            `queryCollections` for details.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components.
            If `None` (default), apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.

            Values other than `False` are deprecated, and only `False` will be
            supported after v26. After v27 this argument will be removed
            entirely.
        mode : `str`, optional
            The way in which datasets are being used in this query; one of:

            - "find_first": this is a query for the first dataset in an
              ordered list of collections. Prohibits collection wildcards,
              but permits dataset type wildcards.

            - "find_all": this is a query for all datasets in all matched
              collections. Permits collection and dataset type wildcards.

            - "constrain": this is a query for something other than datasets,
              with results constrained by dataset existence. Permits
              collection wildcards and prohibits ``...`` as a dataset type
              wildcard.
        doomed_by : `list` [ `str` ]
            List to append messages indicating why the query is doomed to
            yield no results.

        Returns
        -------
        composition : `defaultdict` [ `DatasetType`, `list` [ `str` ] ]
            Dictionary mapping parent dataset type to `list` of components
            matched for that dataset type (or `None` for the parent itself).
        collections : `CollectionWildcard`
            Processed collection expression.
        """
        composition: dict[DatasetType, list[str | None]] = {}
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise NoDefaultCollectionError("No collections, and no registry default collections.")
                collections = self.defaults.collections
            else:
                collections = CollectionWildcard.from_expression(collections)
                if mode == "find_first" and collections.patterns:
                    raise TypeError(
                        f"Collection pattern(s) {collections.patterns} not allowed in this context."
                    )
            missing: list[str] = []
            composition = self._managers.datasets.resolve_wildcard(
                datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
            )
            if missing and mode == "constrain":
                # After v26 this should raise MissingDatasetTypeError, to be
                # implemented on DM-36303.
                warnings.warn(
                    f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
                    FutureWarning,
                )
            doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
        elif collections:
            raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
        return composition, collections

    def queryDatasets(
        self,
        datasetType: Any,
        *,
        collections: Any = None,
        dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
        dataId: Optional[DataId] = None,
        where: Optional[str] = None,
        findFirst: bool = False,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        doomed_by: list[str] = []
        data_id = self.expandDataId(dataId, **kwargs)
        dataset_composition, collections = self._standardize_query_dataset_args(
            datasetType,
            collections,
            components,
            mode="find_first" if findFirst else "find_all",
            doomed_by=doomed_by,
        )
        parent_results: list[queries.ParentDatasetQueryResults] = []
        for parent_dataset_type, components_for_parent in dataset_composition.items():
            # The full set of dimensions in the query is the combination of
            # those needed for the DatasetType and those explicitly requested,
            # if any.
            dimension_names = set(parent_dataset_type.dimensions.names)
            if dimensions is not None:
                dimension_names.update(self.dimensions.extract(dimensions).names)
            # Construct the summary structure needed to construct a
            # QueryBuilder.
            summary = queries.QuerySummary(
                requested=DimensionGraph(self.dimensions, names=dimension_names),
                dataId=data_id,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=[parent_dataset_type],
            )
            builder = self._makeQueryBuilder(summary)
            # Add the dataset subquery to the query, telling the QueryBuilder
            # to include the rank of the selected collection in the results
            # only if we need to findFirst. Note that if any of the
            # collections are actually wildcard expressions, and
            # findFirst=True, this will raise TypeError for us.
            builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
            query = builder.finish()
            parent_results.append(
                queries.ParentDatasetQueryResults(
                    self._db, query, datasetType=parent_dataset_type, components=components_for_parent
                )
            )
        if not parent_results:
            doomed_by.extend(
                f"No registered dataset type matching {t!r} found, so no matching datasets can "
                "exist in any collection."
                for t in ensure_iterable(datasetType)
            )
            return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
        elif len(parent_results) == 1:
            return parent_results[0]
        else:
            return queries.ChainedDatasetQueryResults(parent_results)
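
    # Example (sketch) of a dataset query built on the machinery above; the
    # dataset type, collection, and ``where`` expression are placeholders.
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["ExampleCam/runs/test"],
    #         where="visit = 42 AND detector = 10",
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)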

    def queryDataIds(
        self,
        dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        requestedDimensions = self.dimensions.extract(dimensions)
        doomed_by: list[str] = []
        data_id = self.expandDataId(dataId, **kwargs)
        dataset_composition, collections = self._standardize_query_dataset_args(
            datasets, collections, components, doomed_by=doomed_by
        )

        def query_factory(
            order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
        ) -> queries.Query:
            """Construct the Query object that generates query results."""
            summary = queries.QuerySummary(
                requested=requestedDimensions,
                dataId=data_id,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=dataset_composition.keys(),
                order_by=order_by,
                limit=limit,
            )
            builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
            for datasetType in dataset_composition:
                builder.joinDataset(datasetType, collections, isResult=False)
            return builder.finish()

        return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)

    def queryDimensionRecords(
        self,
        element: Union[DimensionElement, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DimensionRecordQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise DimensionNameError(
                    f"No such dimension '{element}', available dimensions: "
                    + str(self.dimensions.getStaticElements())
                ) from e
        dataIds = self.queryDataIds(
            element.graph,
            dataId=dataId,
            datasets=datasets,
            collections=collections,
            where=where,
            components=components,
            bind=bind,
            check=check,
            **kwargs,
        )
        return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
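
    # Sketch of the record-query interfaces defined above; dimension names,
    # values, and record attributes are placeholders.
    #
    #     for data_id in registry.queryDataIds(["visit"], instrument="ExampleCam"):
    #         print(data_id["visit"])
    #     for rec in registry.queryDimensionRecords("detector", instrument="ExampleCam"):
    #         print(rec.full_name)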

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise NoDefaultCollectionError(
                    "No collections provided to queryDatasetAssociations, and no defaults from registry "
                    "construction."
                )
            collections = self.defaults.collections
        collections = CollectionWildcard.from_expression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in self._managers.collections.resolve_wildcard(
            collections,
            collection_types=frozenset(collectionTypes),
            flatten_chains=flattenChains,
        ):
            query = storage.select(collectionRecord)
            with self._db.query(query) as sql_result:
                sql_mappings = sql_result.mappings().fetchall()
            for row in sql_mappings:
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names),
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """