Coverage for python/lsst/daf/butler/registries/sql.py: 14%
433 statements
coverage.py v6.5.0, created at 2022-10-04 02:19 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28import warnings
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Dict,
33 Iterable,
34 Iterator,
35 List,
36 Literal,
37 Mapping,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 Union,
43)
45import sqlalchemy
46from lsst.resources import ResourcePathExpression
47from lsst.utils.iteration import ensure_iterable
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 Dimension,
59 DimensionConfig,
60 DimensionElement,
61 DimensionGraph,
62 DimensionRecord,
63 DimensionUniverse,
64 NamedKeyMapping,
65 NameLookupMapping,
66 Progress,
67 StorageClassFactory,
68 Timespan,
69 ddl,
70)
71from ..core.utils import transactional
72from ..registry import (
73 ArgumentError,
74 CollectionExpressionError,
75 CollectionSummary,
76 CollectionType,
77 CollectionTypeError,
78 ConflictingDefinitionError,
79 DataIdValueError,
80 DatasetTypeError,
81 DimensionNameError,
82 InconsistentDataIdError,
83 NoDefaultCollectionError,
84 OrphanedRecordError,
85 Registry,
86 RegistryConfig,
87 RegistryDefaults,
88 queries,
89)
90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
94if TYPE_CHECKING:
95 from .._butlerConfig import ButlerConfig
96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
99_LOG = logging.getLogger(__name__)
102class SqlRegistry(Registry):
103 """Registry implementation based on SQLAlchemy.
105 Parameters
106 ----------
107 database : `Database`
108 Database instance to store Registry.
109 defaults : `RegistryDefaults`
110 Default collection search path and/or output `~CollectionType.RUN`
111 collection.
112 managers : `RegistryManagerInstances`
113 All the managers required for this registry.
114 """
116 defaultConfigFile: Optional[str] = None
117 """Path to configuration defaults. Accessed within the ``configs`` resource
118 or relative to a search path. Can be `None` if no defaults are specified.
119 """
121 @classmethod
122 def createFromConfig(
123 cls,
124 config: Optional[Union[RegistryConfig, str]] = None,
125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
126 butlerRoot: Optional[ResourcePathExpression] = None,
127 ) -> Registry:
128 """Create registry database and return `SqlRegistry` instance.
130 This method initializes the database contents; the database must be
131 empty prior to calling this method.
133 Parameters
134 ----------
135 config : `RegistryConfig` or `str`, optional
136 Registry configuration; if missing, the default configuration will
137 be loaded from registry.yaml.
138 dimensionConfig : `DimensionConfig` or `str`, optional
139 Dimensions configuration; if missing, the default configuration
140 will be loaded from dimensions.yaml.
141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
142 Path to the repository root this `SqlRegistry` will manage.
144 Returns
145 -------
146 registry : `SqlRegistry`
147 A new `SqlRegistry` instance.
148 """
149 config = cls.forceRegistryConfig(config)
150 config.replaceRoot(butlerRoot)
152 if isinstance(dimensionConfig, str):
153 dimensionConfig = DimensionConfig(dimensionConfig)
154 elif dimensionConfig is None:
155 dimensionConfig = DimensionConfig()
156 elif not isinstance(dimensionConfig, DimensionConfig):
157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
159 DatabaseClass = config.getDatabaseClass()
160 database = DatabaseClass.fromUri(
161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
162 )
163 managerTypes = RegistryManagerTypes.fromConfig(config)
164 managers = managerTypes.makeRepo(database, dimensionConfig)
165 return cls(database, RegistryDefaults(), managers)
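# A minimal creation sketch (a hedged illustration, not taken from this module;
# it assumes a SQLite connection string is acceptable for the ``db`` key and
# that the target database is empty):
#
#     from lsst.daf.butler.registry import RegistryConfig
#
#     config = RegistryConfig()
#     config["db"] = "sqlite:///:memory:"
#     registry = SqlRegistry.createFromConfig(config)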
167 @classmethod
168 def fromConfig(
169 cls,
170 config: Union[ButlerConfig, RegistryConfig, Config, str],
171 butlerRoot: Optional[ResourcePathExpression] = None,
172 writeable: bool = True,
173 defaults: Optional[RegistryDefaults] = None,
174 ) -> Registry:
175 """Create `Registry` subclass instance from `config`.
177 Registry database must be initialized prior to calling this method.
179 Parameters
180 ----------
181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
182 Registry configuration.
183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
184 Path to the repository root this `Registry` will manage.
185 writeable : `bool`, optional
186 If `True` (default) create a read-write connection to the database.
187 defaults : `RegistryDefaults`, optional
188 Default collection search path and/or output `~CollectionType.RUN`
189 collection.
191 Returns
192 -------
193 registry : `SqlRegistry` (subclass)
194 A new `SqlRegistry` subclass instance.
195 """
196 config = cls.forceRegistryConfig(config)
197 config.replaceRoot(butlerRoot)
198 DatabaseClass = config.getDatabaseClass()
199 database = DatabaseClass.fromUri(
200 str(config.connectionString),
201 origin=config.get("origin", 0),
202 namespace=config.get("namespace"),
203 writeable=writeable,
204 )
205 managerTypes = RegistryManagerTypes.fromConfig(config)
206 managers = managerTypes.loadRepo(database)
207 if defaults is None:
208 defaults = RegistryDefaults()
209 return cls(database, defaults, managers)
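# A sketch of opening an already-initialized repository read-only (the
# butler.yaml path below is hypothetical):
#
#     registry = SqlRegistry.fromConfig("/repo/main/butler.yaml", writeable=False)
#     assert not registry.isWriteable()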
211 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
212 self._db = database
213 self._managers = managers
214 self.storageClasses = StorageClassFactory()
215 # Intentionally invoke property setter to initialize defaults. This
216 # can only be done after most of the rest of Registry has already been
217 # initialized, and must be done before the property getter is used.
218 self.defaults = defaults
219 # In the future DatasetIdFactory may become configurable and this
220 # instance will need to be shared with datasets manager.
221 self.datasetIdFactory = DatasetIdFactory()
223 def __str__(self) -> str:
224 return str(self._db)
226 def __repr__(self) -> str:
227 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
229 def isWriteable(self) -> bool:
230 # Docstring inherited from lsst.daf.butler.registry.Registry
231 return self._db.isWriteable()
233 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
234 # Docstring inherited from lsst.daf.butler.registry.Registry
235 if defaults is None:
236 # No need to copy, because `RegistryDefaults` is immutable; we
237 # effectively copy on write.
238 defaults = self.defaults
239 return type(self)(self._db, defaults, self._managers)
241 @property
242 def dimensions(self) -> DimensionUniverse:
243 # Docstring inherited from lsst.daf.butler.registry.Registry
244 return self._managers.dimensions.universe
246 def refresh(self) -> None:
247 # Docstring inherited from lsst.daf.butler.registry.Registry
248 self._managers.refresh()
250 @contextlib.contextmanager
251 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
252 # Docstring inherited from lsst.daf.butler.registry.Registry
253 try:
254 with self._db.transaction(savepoint=savepoint):
255 yield
256 except BaseException:
257 # TODO: this clears the caches sometimes when we wouldn't actually
258 # need to. Can we avoid that?
259 self._managers.dimensions.clearCaches()
260 raise
262 def resetConnectionPool(self) -> None:
263 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
265 This operation is useful when using the registry with fork-based
266 multiprocessing. To use the registry across a fork boundary, one must
267 ensure that there are no currently active connections (no session or
268 transaction in progress) and reset the connection pool using this
269 method. The child process should call this method immediately
270 after the fork.
271 """
272 self._db._engine.dispose()
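# A fork-safety sketch (assumes no transaction is open at fork time):
#
#     import os
#
#     pid = os.fork()
#     if pid == 0:
#         # Child process: discard inherited pooled connections before first use.
#         registry.resetConnectionPool()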
274 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
275 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
276 other data repository client.
278 Opaque table records can be added via `insertOpaqueData`, retrieved via
279 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
281 Parameters
282 ----------
283 tableName : `str`
284 Logical name of the opaque table. This may differ from the
285 actual name used in the database by a prefix and/or suffix.
286 spec : `ddl.TableSpec`
287 Specification for the table to be added.
288 """
289 self._managers.opaque.register(tableName, spec)
291 @transactional
292 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
293 """Insert records into an opaque table.
295 Parameters
296 ----------
297 tableName : `str`
298 Logical name of the opaque table. Must match the name used in a
299 previous call to `registerOpaqueTable`.
300 data
301 Each additional positional argument is a dictionary that represents
302 a single row to be added.
303 """
304 self._managers.opaque[tableName].insert(*data)
306 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
307 """Retrieve records from an opaque table.
309 Parameters
310 ----------
311 tableName : `str`
312 Logical name of the opaque table. Must match the name used in a
313 previous call to `registerOpaqueTable`.
314 where
315 Additional keyword arguments are interpreted as equality
316 constraints that restrict the returned rows (combined with AND);
317 keyword arguments are column names and values are the values they
318 must have.
320 Yields
321 ------
322 row : `dict`
323 A dictionary representing a single result row.
324 """
325 yield from self._managers.opaque[tableName].fetch(**where)
327 @transactional
328 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
329 """Remove records from an opaque table.
331 Parameters
332 ----------
333 tableName : `str`
334 Logical name of the opaque table. Must match the name used in a
335 previous call to `registerOpaqueTable`.
336 where
337 Additional keyword arguments are interpreted as equality
338 constraints that restrict the deleted rows (combined with AND);
339 keyword arguments are column names and values are the values they
340 must have.
341 """
342 self._managers.opaque[tableName].delete(where.keys(), where)
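# A round-trip sketch for an opaque table (the table name, column layout, and
# ``ddl.FieldSpec`` arguments shown here are assumptions for illustration):
#
#     import sqlalchemy
#     from lsst.daf.butler import ddl
#
#     spec = ddl.TableSpec(
#         fields=[
#             ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#             ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
#         ]
#     )
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)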
344 def registerCollection(
345 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
346 ) -> bool:
347 # Docstring inherited from lsst.daf.butler.registry.Registry
348 _, registered = self._managers.collections.register(name, type, doc=doc)
349 return registered
351 def getCollectionType(self, name: str) -> CollectionType:
352 # Docstring inherited from lsst.daf.butler.registry.Registry
353 return self._managers.collections.find(name).type
355 def _get_collection_record(self, name: str) -> CollectionRecord:
356 # Docstring inherited from lsst.daf.butler.registry.Registry
357 return self._managers.collections.find(name)
359 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
360 # Docstring inherited from lsst.daf.butler.registry.Registry
361 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
362 return registered
364 @transactional
365 def removeCollection(self, name: str) -> None:
366 # Docstring inherited from lsst.daf.butler.registry.Registry
367 self._managers.collections.remove(name)
369 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
370 # Docstring inherited from lsst.daf.butler.registry.Registry
371 record = self._managers.collections.find(parent)
372 if record.type is not CollectionType.CHAINED:
373 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
374 assert isinstance(record, ChainedCollectionRecord)
375 return record.children
377 @transactional
378 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
379 # Docstring inherited from lsst.daf.butler.registry.Registry
380 record = self._managers.collections.find(parent)
381 if record.type is not CollectionType.CHAINED:
382 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
383 assert isinstance(record, ChainedCollectionRecord)
384 children = CollectionWildcard.from_expression(children).require_ordered()
385 if children != record.children or flatten:
386 record.update(self._managers.collections, children, flatten=flatten)
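# A chained-collection sketch (collection names are hypothetical; child
# collections must already be registered before being placed in a chain):
#
#     registry.registerRun("HSC/runs/my-run")
#     registry.registerCollection("HSC/calib", CollectionType.CALIBRATION)
#     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/my-run", "HSC/calib"])
#     registry.getCollectionChain("HSC/defaults")  # ("HSC/runs/my-run", "HSC/calib")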
388 def getCollectionParentChains(self, collection: str) -> Set[str]:
389 # Docstring inherited from lsst.daf.butler.registry.Registry
390 return {
391 record.name
392 for record in self._managers.collections.getParentChains(
393 self._managers.collections.find(collection).key
394 )
395 }
397 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
398 # Docstring inherited from lsst.daf.butler.registry.Registry
399 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
401 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
402 # Docstring inherited from lsst.daf.butler.registry.Registry
403 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
405 def getCollectionSummary(self, collection: str) -> CollectionSummary:
406 # Docstring inherited from lsst.daf.butler.registry.Registry
407 record = self._managers.collections.find(collection)
408 return self._managers.datasets.getCollectionSummary(record)
410 def registerDatasetType(self, datasetType: DatasetType) -> bool:
411 # Docstring inherited from lsst.daf.butler.registry.Registry
412 _, inserted = self._managers.datasets.register(datasetType)
413 return inserted
415 def removeDatasetType(self, name: str) -> None:
416 # Docstring inherited from lsst.daf.butler.registry.Registry
417 self._managers.datasets.remove(name)
419 def getDatasetType(self, name: str) -> DatasetType:
420 # Docstring inherited from lsst.daf.butler.registry.Registry
421 return self._managers.datasets[name].datasetType
423 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
424 # Docstring inherited from lsst.daf.butler.registry.Registry
425 return self._managers.datasets.supportsIdGenerationMode(mode)
427 def findDataset(
428 self,
429 datasetType: Union[DatasetType, str],
430 dataId: Optional[DataId] = None,
431 *,
432 collections: Any = None,
433 timespan: Optional[Timespan] = None,
434 **kwargs: Any,
435 ) -> Optional[DatasetRef]:
436 # Docstring inherited from lsst.daf.butler.registry.Registry
437 if isinstance(datasetType, DatasetType):
438 storage = self._managers.datasets[datasetType.name]
439 else:
440 storage = self._managers.datasets[datasetType]
441 dataId = DataCoordinate.standardize(
442 dataId,
443 graph=storage.datasetType.dimensions,
444 universe=self.dimensions,
445 defaults=self.defaults.dataId,
446 **kwargs,
447 )
448 if collections is None:
449 if not self.defaults.collections:
450 raise NoDefaultCollectionError(
451 "No collections provided to findDataset, and no defaults from registry construction."
452 )
453 collections = self.defaults.collections
454 collections = CollectionWildcard.from_expression(collections)
455 collections.require_ordered()
456 for collectionRecord in self._managers.collections.resolve_wildcard(collections):
457 if collectionRecord.type is CollectionType.CALIBRATION and (
458 not storage.datasetType.isCalibration() or timespan is None
459 ):
460 continue
461 result = storage.find(collectionRecord, dataId, timespan=timespan)
462 if result is not None:
463 return result
465 return None
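# A lookup sketch (the dataset type, data ID values, and collection name are
# hypothetical):
#
#     ref = registry.findDataset(
#         "calexp",
#         instrument="HSC",
#         visit=903334,
#         detector=16,
#         collections="HSC/runs/RC2",
#     )
#     # ``ref`` is None if no matching dataset exists in the searched collections.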
467 @transactional
468 def insertDatasets(
469 self,
470 datasetType: Union[DatasetType, str],
471 dataIds: Iterable[DataId],
472 run: Optional[str] = None,
473 expand: bool = True,
474 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
475 ) -> List[DatasetRef]:
476 # Docstring inherited from lsst.daf.butler.registry.Registry
477 if isinstance(datasetType, DatasetType):
478 storage = self._managers.datasets.find(datasetType.name)
479 if storage is None:
480 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
481 else:
482 storage = self._managers.datasets.find(datasetType)
483 if storage is None:
484 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
485 if run is None:
486 if self.defaults.run is None:
487 raise NoDefaultCollectionError(
488 "No run provided to insertDatasets, and no default from registry construction."
489 )
490 run = self.defaults.run
491 runRecord = self._managers.collections.find(run)
492 if runRecord.type is not CollectionType.RUN:
493 raise CollectionTypeError(
494 f"Given collection is of type {runRecord.type.name}; RUN collection required."
495 )
496 assert isinstance(runRecord, RunRecord)
497 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
498 if expand:
499 expandedDataIds = [
500 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
501 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
502 ]
503 else:
504 expandedDataIds = [
505 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
506 ]
507 try:
508 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
509 if self._managers.obscore:
510 self._managers.obscore.add_datasets(refs)
511 except sqlalchemy.exc.IntegrityError as err:
512 raise ConflictingDefinitionError(
513 f"A database constraint failure was triggered by inserting "
514 f"one or more datasets of type {storage.datasetType} into "
515 f"collection '{run}'. "
516 f"This probably means a dataset with the same data ID "
517 f"and dataset type already exists, but it may also mean a "
518 f"dimension row is missing."
519 ) from err
520 return refs
522 @transactional
523 def _importDatasets(
524 self,
525 datasets: Iterable[DatasetRef],
526 expand: bool = True,
527 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
528 reuseIds: bool = False,
529 ) -> List[DatasetRef]:
530 # Docstring inherited from lsst.daf.butler.registry.Registry
531 datasets = list(datasets)
532 if not datasets:
533 # nothing to do
534 return []
536 # find dataset type
537 datasetTypes = set(dataset.datasetType for dataset in datasets)
538 if len(datasetTypes) != 1:
539 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
540 datasetType = datasetTypes.pop()
542 # get storage handler for this dataset type
543 storage = self._managers.datasets.find(datasetType.name)
544 if storage is None:
545 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
547 # find run name
548 runs = set(dataset.run for dataset in datasets)
549 if len(runs) != 1:
550 raise ValueError(f"Multiple run names in input datasets: {runs}")
551 run = runs.pop()
552 if run is None:
553 if self.defaults.run is None:
554 raise NoDefaultCollectionError(
555 "No run provided to ingestDatasets, and no default from registry construction."
556 )
557 run = self.defaults.run
559 runRecord = self._managers.collections.find(run)
560 if runRecord.type is not CollectionType.RUN:
561 raise CollectionTypeError(
562 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
563 " RUN collection required."
564 )
565 assert isinstance(runRecord, RunRecord)
567 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
568 if expand:
569 expandedDatasets = [
570 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
571 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
572 ]
573 else:
574 expandedDatasets = [
575 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
576 for dataset in datasets
577 ]
579 try:
580 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
581 if self._managers.obscore:
582 self._managers.obscore.add_datasets(refs)
583 except sqlalchemy.exc.IntegrityError as err:
584 raise ConflictingDefinitionError(
585 f"A database constraint failure was triggered by inserting "
586 f"one or more datasets of type {storage.datasetType} into "
587 f"collection '{run}'. "
588 f"This probably means a dataset with the same data ID "
589 f"and dataset type already exists, but it may also mean a "
590 f"dimension row is missing."
591 ) from err
592 return refs
594 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
595 # Docstring inherited from lsst.daf.butler.registry.Registry
596 return self._managers.datasets.getDatasetRef(id)
598 @transactional
599 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
600 # Docstring inherited from lsst.daf.butler.registry.Registry
601 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
602 for datasetType, refsForType in progress.iter_item_chunks(
603 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
604 ):
605 storage = self._managers.datasets[datasetType.name]
606 try:
607 storage.delete(refsForType)
608 except sqlalchemy.exc.IntegrityError as err:
609 raise OrphanedRecordError(
610 "One or more datasets is still present in one or more Datastores."
611 ) from err
613 @transactional
614 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
615 # Docstring inherited from lsst.daf.butler.registry.Registry
616 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
617 collectionRecord = self._managers.collections.find(collection)
618 if collectionRecord.type is not CollectionType.TAGGED:
619 raise CollectionTypeError(
620 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
621 )
622 for datasetType, refsForType in progress.iter_item_chunks(
623 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
624 ):
625 storage = self._managers.datasets[datasetType.name]
626 try:
627 storage.associate(collectionRecord, refsForType)
628 if self._managers.obscore:
629 # If a TAGGED collection is being monitored by ObsCore
630 # manager then we may need to save the dataset.
631 self._managers.obscore.associate(refsForType, collectionRecord)
632 except sqlalchemy.exc.IntegrityError as err:
633 raise ConflictingDefinitionError(
634 f"Constraint violation while associating dataset of type {datasetType.name} with "
635 f"collection {collection}. This probably means that one or more datasets with the same "
636 f"dataset type and data ID already exist in the collection, but it may also indicate "
637 f"that the datasets do not exist."
638 ) from err
640 @transactional
641 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
642 # Docstring inherited from lsst.daf.butler.registry.Registry
643 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
644 collectionRecord = self._managers.collections.find(collection)
645 if collectionRecord.type is not CollectionType.TAGGED:
646 raise CollectionTypeError(
647 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
648 )
649 for datasetType, refsForType in progress.iter_item_chunks(
650 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
651 ):
652 storage = self._managers.datasets[datasetType.name]
653 storage.disassociate(collectionRecord, refsForType)
654 if self._managers.obscore:
655 self._managers.obscore.disassociate(refsForType, collectionRecord)
657 @transactional
658 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
659 # Docstring inherited from lsst.daf.butler.registry.Registry
660 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
661 collectionRecord = self._managers.collections.find(collection)
662 for datasetType, refsForType in progress.iter_item_chunks(
663 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
664 ):
665 storage = self._managers.datasets[datasetType.name]
666 storage.certify(collectionRecord, refsForType, timespan)
668 @transactional
669 def decertify(
670 self,
671 collection: str,
672 datasetType: Union[str, DatasetType],
673 timespan: Timespan,
674 *,
675 dataIds: Optional[Iterable[DataId]] = None,
676 ) -> None:
677 # Docstring inherited from lsst.daf.butler.registry.Registry
678 collectionRecord = self._managers.collections.find(collection)
679 if isinstance(datasetType, str):
680 storage = self._managers.datasets[datasetType]
681 else:
682 storage = self._managers.datasets[datasetType.name]
683 standardizedDataIds = None
684 if dataIds is not None:
685 standardizedDataIds = [
686 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
687 ]
688 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
690 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
691 """Return an object that allows a new `Datastore` instance to
692 communicate with this `Registry`.
694 Returns
695 -------
696 manager : `DatastoreRegistryBridgeManager`
697 Object that mediates communication between this `Registry` and its
698 associated datastores.
699 """
700 return self._managers.datastores
702 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
703 # Docstring inherited from lsst.daf.butler.registry.Registry
704 return self._managers.datastores.findDatastores(ref)
706 def expandDataId(
707 self,
708 dataId: Optional[DataId] = None,
709 *,
710 graph: Optional[DimensionGraph] = None,
711 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
712 withDefaults: bool = True,
713 **kwargs: Any,
714 ) -> DataCoordinate:
715 # Docstring inherited from lsst.daf.butler.registry.Registry
716 if not withDefaults:
717 defaults = None
718 else:
719 defaults = self.defaults.dataId
720 try:
721 standardized = DataCoordinate.standardize(
722 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
723 )
724 except KeyError as exc:
725 # This means either kwargs have some odd name or required
726 # dimension is missing.
727 raise DimensionNameError(str(exc)) from exc
728 if standardized.hasRecords():
729 return standardized
730 if records is None:
731 records = {}
732 elif isinstance(records, NamedKeyMapping):
733 records = records.byName()
734 else:
735 records = dict(records)
736 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
737 records.update(dataId.records.byName())
738 keys = standardized.byName()
739 for element in standardized.graph.primaryKeyTraversalOrder:
740 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
741 if record is ...:
742 if isinstance(element, Dimension) and keys.get(element.name) is None:
743 if element in standardized.graph.required:
744 raise DimensionNameError(
745 f"No value or null value for required dimension {element.name}."
746 )
747 keys[element.name] = None
748 record = None
749 else:
750 storage = self._managers.dimensions[element]
751 dataIdSet = DataCoordinateIterable.fromScalar(
752 DataCoordinate.standardize(keys, graph=element.graph)
753 )
754 fetched = tuple(storage.fetch(dataIdSet))
755 try:
756 (record,) = fetched
757 except ValueError:
758 record = None
759 records[element.name] = record
760 if record is not None:
761 for d in element.implied:
762 value = getattr(record, d.name)
763 if keys.setdefault(d.name, value) != value:
764 raise InconsistentDataIdError(
765 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
766 f"but {element.name} implies {d.name}={value!r}."
767 )
768 else:
769 if element in standardized.graph.required:
770 raise DataIdValueError(
771 f"Could not fetch record for required dimension {element.name} via keys {keys}."
772 )
773 if element.alwaysJoin:
774 raise InconsistentDataIdError(
775 f"Could not fetch record for element {element.name} via keys {keys}, ",
776 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
777 "related.",
778 )
779 for d in element.implied:
780 keys.setdefault(d.name, None)
781 records.setdefault(d.name, None)
782 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
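# An expansion sketch (the instrument and detector values are hypothetical):
#
#     data_id = registry.expandDataId(instrument="HSC", detector=16)
#     assert data_id.hasRecords()
#     data_id.records["detector"]  # full DimensionRecord fetched from the database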
784 def insertDimensionData(
785 self,
786 element: Union[DimensionElement, str],
787 *data: Union[Mapping[str, Any], DimensionRecord],
788 conform: bool = True,
789 replace: bool = False,
790 skip_existing: bool = False,
791 ) -> None:
792 # Docstring inherited from lsst.daf.butler.registry.Registry
793 if conform:
794 if isinstance(element, str):
795 element = self.dimensions[element]
796 records = [
797 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
798 ]
799 else:
800 # Ignore typing since caller said to trust them with conform=False.
801 records = data # type: ignore
802 storage = self._managers.dimensions[element] # type: ignore
803 storage.insert(*records, replace=replace, skip_existing=skip_existing)
805 def syncDimensionData(
806 self,
807 element: Union[DimensionElement, str],
808 row: Union[Mapping[str, Any], DimensionRecord],
809 conform: bool = True,
810 update: bool = False,
811 ) -> Union[bool, Dict[str, Any]]:
812 # Docstring inherited from lsst.daf.butler.registry.Registry
813 if conform:
814 if isinstance(element, str):
815 element = self.dimensions[element]
816 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
817 else:
818 # Ignore typing since caller said to trust them with conform=False.
819 record = row # type: ignore
820 storage = self._managers.dimensions[element] # type: ignore
821 return storage.sync(record, update=update)
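# An insertion sketch (record fields assume the default dimension
# configuration; all values are illustrative):
#
#     registry.insertDimensionData(
#         "instrument",
#         {"name": "DummyCam", "class_name": "lsst.obs.dummy.DummyCam",
#          "visit_max": 1_000_000, "exposure_max": 1_000_000, "detector_max": 4},
#     )
#     # syncDimensionData takes the same kind of row, but inserts it only if it
#     # is not already present (optionally updating it) and reports what it did.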
823 def queryDatasetTypes(
824 self,
825 expression: Any = ...,
826 *,
827 components: Optional[bool] = None,
828 missing: Optional[List[str]] = None,
829 ) -> Iterable[DatasetType]:
830 # Docstring inherited from lsst.daf.butler.registry.Registry
831 wildcard = DatasetTypeWildcard.from_expression(expression)
832 composition_dict = self._managers.datasets.resolve_wildcard(
833 wildcard,
834 components=components,
835 missing=missing,
836 )
837 result: list[DatasetType] = []
838 for parent_dataset_type, components_for_parent in composition_dict.items():
839 result.extend(
840 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
841 for c in components_for_parent
842 )
843 return result
845 def queryCollections(
846 self,
847 expression: Any = ...,
848 datasetType: Optional[DatasetType] = None,
849 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
850 flattenChains: bool = False,
851 includeChains: Optional[bool] = None,
852 ) -> Sequence[str]:
853 # Docstring inherited from lsst.daf.butler.registry.Registry
855 # Right now the datasetType argument is completely ignored, but that
856 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
857 # ticket will take care of that.
858 try:
859 wildcard = CollectionWildcard.from_expression(expression)
860 except TypeError as exc:
861 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
862 collectionTypes = ensure_iterable(collectionTypes)
863 return [
864 record.name
865 for record in self._managers.collections.resolve_wildcard(
866 wildcard,
867 collection_types=frozenset(collectionTypes),
868 flatten_chains=flattenChains,
869 include_chains=includeChains,
870 )
871 ]
873 def _makeQueryBuilder(
874 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
875 ) -> queries.QueryBuilder:
876 """Return a `QueryBuilder` instance capable of constructing and
877 managing more complex queries than those obtainable via `Registry`
878 interfaces.
880 This is an advanced interface; downstream code should prefer
881 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
882 are sufficient.
884 Parameters
885 ----------
886 summary : `queries.QuerySummary`
887 Object describing and categorizing the full set of dimensions that
888 will be included in the query.
889 doomed_by : `Iterable` of `str`, optional
890 A list of diagnostic messages that indicate why the query is going
891 to yield no results and should not even be executed. If an empty
892 container (default) the query will be executed unless other code
893 determines that it is doomed.
895 Returns
896 -------
897 builder : `queries.QueryBuilder`
898 Object that can be used to construct and perform advanced queries.
899 """
900 return queries.QueryBuilder(
901 summary,
902 backend=queries.SqlQueryBackend(self._db, self._managers),
903 doomed_by=doomed_by,
904 )
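# An advanced-query sketch mirroring how queryDataIds below uses this helper
# internally (the requested dimension name is illustrative):
#
#     summary = queries.QuerySummary(requested=self.dimensions.extract(["detector"]))
#     builder = self._makeQueryBuilder(summary)
#     query = builder.finish()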
906 def _standardize_query_dataset_args(
907 self,
908 datasets: Any,
909 collections: Any,
910 components: bool | None,
911 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
912 *,
913 doomed_by: list[str],
914 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
915 """Preprocess dataset arguments passed to query* methods.
917 Parameters
918 ----------
919 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
920 Expression identifying dataset types. See `queryDatasetTypes` for
921 details.
922 collections : `str`, `re.Pattern`, or iterable of these
923 Expression identifying collections to be searched. See
924 `queryCollections` for details.
925 components : `bool`, optional
926 If `True`, apply all expression patterns to component dataset type
927 names as well. If `False`, never apply patterns to components.
928 If `None` (default), apply patterns to components only if their
929 parent datasets were not matched by the expression.
930 Fully-specified component datasets (`str` or `DatasetType`
931 instances) are always included.
932 mode : `str`, optional
933 The way in which datasets are being used in this query; one of:
935 - "find_first": this is a query for the first dataset in an
936 ordered list of collections. Prohibits collection wildcards,
937 but permits dataset type wildcards.
939 - "find_all": this is a query for all datasets in all matched
940 collections. Permits collection and dataset type wildcards.
942 - "constrain": this is a query for something other than datasets,
943 with results constrained by dataset existence. Permits
944 collection wildcards and prohibits ``...`` as a dataset type
945 wildcard.
946 doomed_by : `list` [ `str` ]
947 List to append messages indicating why the query is doomed to
948 yield no results.
950 Returns
951 -------
952 composition : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
953 Dictionary mapping parent dataset type to `list` of components
954 matched for that dataset type (or `None` for the parent itself).
955 collections : `CollectionWildcard`
956 Processed collection expression.
957 """
958 composition: dict[DatasetType, list[str | None]] = {}
959 if datasets is not None:
960 if not collections:
961 if not self.defaults.collections:
962 raise NoDefaultCollectionError("No collections, and no registry default collections.")
963 collections = self.defaults.collections
964 else:
965 collections = CollectionWildcard.from_expression(collections)
966 if mode == "find_first" and collections.patterns:
967 raise TypeError(
968 f"Collection pattern(s) {collections.patterns} not allowed in this context."
969 )
970 missing: list[str] = []
971 composition = self._managers.datasets.resolve_wildcard(
972 datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
973 )
974 if missing and mode == "constrain":
975 # After v26 this should raise MissingDatasetTypeError, to be
976 # implemented on DM-36303.
977 warnings.warn(
978 f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
979 FutureWarning,
980 )
981 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
982 elif collections:
983 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
984 return composition, collections
986 def queryDatasets(
987 self,
988 datasetType: Any,
989 *,
990 collections: Any = None,
991 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
992 dataId: Optional[DataId] = None,
993 where: Optional[str] = None,
994 findFirst: bool = False,
995 components: Optional[bool] = None,
996 bind: Optional[Mapping[str, Any]] = None,
997 check: bool = True,
998 **kwargs: Any,
999 ) -> queries.DatasetQueryResults:
1000 # Docstring inherited from lsst.daf.butler.registry.Registry
1001 doomed_by: list[str] = []
1002 data_id = self.expandDataId(dataId, **kwargs)
1003 dataset_composition, collections = self._standardize_query_dataset_args(
1004 datasetType,
1005 collections,
1006 components,
1007 mode="find_first" if findFirst else "find_all",
1008 doomed_by=doomed_by,
1009 )
1010 parent_results: list[queries.ParentDatasetQueryResults] = []
1011 for parent_dataset_type, components_for_parent in dataset_composition.items():
1012 # The full set of dimensions in the query is the combination of
1013 # those needed for the DatasetType and those explicitly requested,
1014 # if any.
1015 dimension_names = set(parent_dataset_type.dimensions.names)
1016 if dimensions is not None:
1017 dimension_names.update(self.dimensions.extract(dimensions).names)
1018 # Construct the summary structure needed to construct a
1019 # QueryBuilder.
1020 summary = queries.QuerySummary(
1021 requested=DimensionGraph(self.dimensions, names=dimension_names),
1022 dataId=data_id,
1023 expression=where,
1024 bind=bind,
1025 defaults=self.defaults.dataId,
1026 check=check,
1027 datasets=[parent_dataset_type],
1028 )
1029 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1030 # Add the dataset subquery to the query, telling the QueryBuilder
1031 # to include the rank of the selected collection in the results
1032 # only if we need to findFirst. Note that if any of the
1033 # collections are actually wildcard expressions, and
1034 # findFirst=True, this will raise TypeError for us.
1035 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
1036 query = builder.finish()
1037 parent_results.append(
1038 queries.ParentDatasetQueryResults(
1039 self._db, query, datasetType=parent_dataset_type, components=components_for_parent
1040 )
1041 )
1042 if not parent_results:
1043 doomed_by.extend(
1044 f"No registered dataset type matching {t!r} found, so no matching datasets can "
1045 "exist in any collection."
1046 for t in ensure_iterable(datasetType)
1047 )
1048 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
1049 elif len(parent_results) == 1:
1050 return parent_results[0]
1051 else:
1052 return queries.ChainedDatasetQueryResults(parent_results)
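# A query sketch (the dataset type, collection, and ``where`` values are
# hypothetical):
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections="HSC/runs/RC2",
#         where="instrument='HSC' AND detector=16",
#         findFirst=True,
#     )
#     for ref in refs:
#         print(ref.dataId)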
1054 def queryDataIds(
1055 self,
1056 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1057 *,
1058 dataId: Optional[DataId] = None,
1059 datasets: Any = None,
1060 collections: Any = None,
1061 where: Optional[str] = None,
1062 components: Optional[bool] = None,
1063 bind: Optional[Mapping[str, Any]] = None,
1064 check: bool = True,
1065 **kwargs: Any,
1066 ) -> queries.DataCoordinateQueryResults:
1067 # Docstring inherited from lsst.daf.butler.registry.Registry
1068 dimensions = ensure_iterable(dimensions)
1069 requestedDimensions = self.dimensions.extract(dimensions)
1070 doomed_by: list[str] = []
1071 data_id = self.expandDataId(dataId, **kwargs)
1072 dataset_composition, collections = self._standardize_query_dataset_args(
1073 datasets, collections, components, doomed_by=doomed_by
1074 )
1076 def query_factory(
1077 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1078 ) -> queries.Query:
1079 """Construct the Query object that generates query results."""
1080 summary = queries.QuerySummary(
1081 requested=requestedDimensions,
1082 dataId=data_id,
1083 expression=where,
1084 bind=bind,
1085 defaults=self.defaults.dataId,
1086 check=check,
1087 datasets=dataset_composition.keys(),
1088 order_by=order_by,
1089 limit=limit,
1090 )
1091 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1092 for datasetType in dataset_composition:
1093 builder.joinDataset(datasetType, collections, isResult=False)
1094 return builder.finish()
1096 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
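# A data-ID query sketch (dimension names, dataset type, and collection name
# are hypothetical):
#
#     data_ids = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="HSC/raw/all",
#         where="instrument='HSC'",
#     )
#     data_ids = data_ids.expanded()  # attach DimensionRecords to each result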
1098 def queryDimensionRecords(
1099 self,
1100 element: Union[DimensionElement, str],
1101 *,
1102 dataId: Optional[DataId] = None,
1103 datasets: Any = None,
1104 collections: Any = None,
1105 where: Optional[str] = None,
1106 components: Optional[bool] = None,
1107 bind: Optional[Mapping[str, Any]] = None,
1108 check: bool = True,
1109 **kwargs: Any,
1110 ) -> queries.DimensionRecordQueryResults:
1111 # Docstring inherited from lsst.daf.butler.registry.Registry
1112 if not isinstance(element, DimensionElement):
1113 try:
1114 element = self.dimensions[element]
1115 except KeyError as e:
1116 raise DimensionNameError(
1117 f"No such dimension '{element}', available dimensions: "
1118 + str(self.dimensions.getStaticElements())
1119 ) from e
1120 dataIds = self.queryDataIds(
1121 element.graph,
1122 dataId=dataId,
1123 datasets=datasets,
1124 collections=collections,
1125 where=where,
1126 components=components,
1127 bind=bind,
1128 check=check,
1129 **kwargs,
1130 )
1131 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1133 def queryDatasetAssociations(
1134 self,
1135 datasetType: Union[str, DatasetType],
1136 collections: Any = ...,
1137 *,
1138 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1139 flattenChains: bool = False,
1140 ) -> Iterator[DatasetAssociation]:
1141 # Docstring inherited from lsst.daf.butler.registry.Registry
1142 if collections is None:
1143 if not self.defaults.collections:
1144 raise NoDefaultCollectionError(
1145 "No collections provided to findDataset, and no defaults from registry construction."
1146 )
1147 collections = self.defaults.collections
1148 collections = CollectionWildcard.from_expression(collections)
1149 TimespanReprClass = self._db.getTimespanRepresentation()
1150 if isinstance(datasetType, str):
1151 storage = self._managers.datasets[datasetType]
1152 else:
1153 storage = self._managers.datasets[datasetType.name]
1154 for collectionRecord in self._managers.collections.resolve_wildcard(
1155 collections,
1156 collection_types=frozenset(collectionTypes),
1157 flatten_chains=flattenChains,
1158 ):
1159 query = storage.select(collectionRecord)
1160 for row in self._db.query(query).mappings():
1161 dataId = DataCoordinate.fromRequiredValues(
1162 storage.datasetType.dimensions,
1163 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1164 )
1165 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1166 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1167 if collectionRecord.type is CollectionType.CALIBRATION:
1168 timespan = TimespanReprClass.extract(row)
1169 else:
1170 timespan = None
1171 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1173 storageClasses: StorageClassFactory
1174 """All storage classes known to the registry (`StorageClassFactory`).
1175 """