Coverage for python/lsst/daf/butler/registries/sql.py: 13%
477 statements
coverage.py v6.4.4, created at 2022-09-30 02:19 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Dict,
33 Iterable,
34 Iterator,
35 List,
36 Literal,
37 Mapping,
38 Optional,
39 Set,
40 Tuple,
41 Union,
42)
44import sqlalchemy
45from lsst.resources import ResourcePathExpression
46from lsst.utils.iteration import ensure_iterable
48from ..core import (
49 Config,
50 DataCoordinate,
51 DataCoordinateIterable,
52 DataId,
53 DatasetAssociation,
54 DatasetId,
55 DatasetRef,
56 DatasetType,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 NamedKeyMapping,
64 NameLookupMapping,
65 Progress,
66 StorageClassFactory,
67 Timespan,
68 ddl,
69)
70from ..core.utils import transactional
71from ..registry import (
72 ArgumentError,
73 CollectionExpressionError,
74 CollectionSearch,
75 CollectionSummary,
76 CollectionType,
77 CollectionTypeError,
78 ConflictingDefinitionError,
79 DataIdValueError,
80 DatasetTypeError,
81 DatasetTypeExpressionError,
82 DimensionNameError,
83 InconsistentDataIdError,
84 NoDefaultCollectionError,
85 OrphanedRecordError,
86 Registry,
87 RegistryConfig,
88 RegistryDefaults,
89 queries,
90)
91from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
92from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
93from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
95if TYPE_CHECKING:  # 95 ↛ 96: line 95 didn't jump to line 96, because the condition on line 95 was never true
96 from .._butlerConfig import ButlerConfig
97 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
100_LOG = logging.getLogger(__name__)
103class SqlRegistry(Registry):
104 """Registry implementation based on SQLAlchemy.
106 Parameters
107 ----------
108 database : `Database`
109 Database instance used to store registry data.
110 defaults : `RegistryDefaults`
111 Default collection search path and/or output `~CollectionType.RUN`
112 collection.
113 managers : `RegistryManagerInstances`
114 All the managers required for this registry.
115 """
117 defaultConfigFile: Optional[str] = None
118 """Path to configuration defaults. Accessed within the ``configs`` resource
119 or relative to a search path. Can be `None` if no defaults are specified.
120 """
122 @classmethod
123 def createFromConfig(
124 cls,
125 config: Optional[Union[RegistryConfig, str]] = None,
126 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
127 butlerRoot: Optional[ResourcePathExpression] = None,
128 ) -> Registry:
129 """Create registry database and return `SqlRegistry` instance.
131 This method initializes database contents; the database must be empty
132 prior to calling this method.
134 Parameters
135 ----------
136 config : `RegistryConfig` or `str`, optional
137 Registry configuration; if missing, the default configuration will
138 be loaded from ``registry.yaml``.
139 dimensionConfig : `DimensionConfig` or `str`, optional
140 Dimensions configuration; if missing, the default configuration
141 will be loaded from ``dimensions.yaml``.
142 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
143 Path to the repository root this `SqlRegistry` will manage.
145 Returns
146 -------
147 registry : `SqlRegistry`
148 A new `SqlRegistry` instance.
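
Examples
--------
A minimal sketch of typical use; the in-memory SQLite connection string
below is an illustrative placeholder, not a default of this method::

    config = RegistryConfig()
    config["db"] = "sqlite://"
    registry = SqlRegistry.createFromConfig(config)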
149 """
150 config = cls.forceRegistryConfig(config)
151 config.replaceRoot(butlerRoot)
153 if isinstance(dimensionConfig, str):
154 dimensionConfig = DimensionConfig(dimensionConfig)
155 elif dimensionConfig is None:
156 dimensionConfig = DimensionConfig()
157 elif not isinstance(dimensionConfig, DimensionConfig):
158 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
160 DatabaseClass = config.getDatabaseClass()
161 database = DatabaseClass.fromUri(
162 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
163 )
164 managerTypes = RegistryManagerTypes.fromConfig(config)
165 managers = managerTypes.makeRepo(database, dimensionConfig)
166 return cls(database, RegistryDefaults(), managers)
168 @classmethod
169 def fromConfig(
170 cls,
171 config: Union[ButlerConfig, RegistryConfig, Config, str],
172 butlerRoot: Optional[ResourcePathExpression] = None,
173 writeable: bool = True,
174 defaults: Optional[RegistryDefaults] = None,
175 ) -> Registry:
176 """Create `Registry` subclass instance from `config`.
178 Registry database must be initialized prior to calling this method.
180 Parameters
181 ----------
182 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
183 Registry configuration.
184 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
185 Path to the repository root this `Registry` will manage.
186 writeable : `bool`, optional
187 If `True` (default) create a read-write connection to the database.
188 defaults : `RegistryDefaults`, optional
189 Default collection search path and/or output `~CollectionType.RUN`
190 collection.
192 Returns
193 -------
194 registry : `SqlRegistry` (subclass)
195 A new `SqlRegistry` subclass instance.
196 """
197 config = cls.forceRegistryConfig(config)
198 config.replaceRoot(butlerRoot)
199 DatabaseClass = config.getDatabaseClass()
200 database = DatabaseClass.fromUri(
201 str(config.connectionString),
202 origin=config.get("origin", 0),
203 namespace=config.get("namespace"),
204 writeable=writeable,
205 )
206 managerTypes = RegistryManagerTypes.fromConfig(config)
207 managers = managerTypes.loadRepo(database)
208 if defaults is None:
209 defaults = RegistryDefaults()
210 return cls(database, defaults, managers)
212 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
213 self._db = database
214 self._managers = managers
215 self.storageClasses = StorageClassFactory()
216 # Intentionally invoke property setter to initialize defaults. This
217 # can only be done after most of the rest of Registry has already been
218 # initialized, and must be done before the property getter is used.
219 self.defaults = defaults
220 # In the future DatasetIdFactory may become configurable and this
221 # instance will need to be shared with datasets manager.
222 self.datasetIdFactory = DatasetIdFactory()
224 def __str__(self) -> str:
225 return str(self._db)
227 def __repr__(self) -> str:
228 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
230 def isWriteable(self) -> bool:
231 # Docstring inherited from lsst.daf.butler.registry.Registry
232 return self._db.isWriteable()
234 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
235 # Docstring inherited from lsst.daf.butler.registry.Registry
236 if defaults is None:
237 # No need to copy, because `RegistryDefaults` is immutable; we
238 # effectively copy on write.
239 defaults = self.defaults
240 return type(self)(self._db, defaults, self._managers)
242 @property
243 def dimensions(self) -> DimensionUniverse:
244 # Docstring inherited from lsst.daf.butler.registry.Registry
245 return self._managers.dimensions.universe
247 def refresh(self) -> None:
248 # Docstring inherited from lsst.daf.butler.registry.Registry
249 self._managers.refresh()
251 @contextlib.contextmanager
252 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
253 # Docstring inherited from lsst.daf.butler.registry.Registry
254 try:
255 with self._db.transaction(savepoint=savepoint):
256 yield
257 except BaseException:
258 # TODO: this clears the caches sometimes when we wouldn't actually
259 # need to. Can we avoid that?
260 self._managers.dimensions.clearCaches()
261 raise
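# An illustrative (not authoritative) sketch of how this context manager is
# typically used to make several registry writes atomic; the run and
# dimension values below are placeholders:
#
#     with registry.transaction(savepoint=True):
#         registry.registerRun("u/example/run")
#         registry.insertDimensionData("instrument", {"name": "DummyCam"})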
263 def resetConnectionPool(self) -> None:
264 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
266 This operation is useful when using the registry with fork-based
267 multiprocessing. To use the registry across a fork boundary, one has to
268 make sure that there are no currently active connections (no session or
269 transaction in progress) and that the connection pool is reset using
270 this method. This method should be called by the child process
271 immediately after the fork.
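
Examples
--------
An illustrative sketch of the intended call pattern with `multiprocessing`;
``worker`` and ``registry`` are placeholder names::

    import multiprocessing

    def worker(registry: SqlRegistry) -> None:
        registry.resetConnectionPool()
        # The child process can use the registry safely from here on.

    ctx = multiprocessing.get_context("fork")
    proc = ctx.Process(target=worker, args=(registry,))
    proc.start()
    proc.join()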
272 """
273 self._db._engine.dispose()
275 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
276 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
277 other data repository client.
279 Opaque table records can be added via `insertOpaqueData`, retrieved via
280 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
282 Parameters
283 ----------
284 tableName : `str`
285 Logical name of the opaque table. This may differ from the
286 actual name used in the database by a prefix and/or suffix.
287 spec : `ddl.TableSpec`
288 Specification for the table to be added.
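
Examples
--------
An illustrative sketch only; the table name and columns are placeholders,
not a real `Datastore` schema::

    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
            ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
        ]
    )
    registry.registerOpaqueTable("example_datastore_records", spec)
    registry.insertOpaqueData("example_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))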
289 """
290 self._managers.opaque.register(tableName, spec)
292 @transactional
293 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
294 """Insert records into an opaque table.
296 Parameters
297 ----------
298 tableName : `str`
299 Logical name of the opaque table. Must match the name used in a
300 previous call to `registerOpaqueTable`.
301 data
302 Each additional positional argument is a dictionary that represents
303 a single row to be added.
304 """
305 self._managers.opaque[tableName].insert(*data)
307 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
308 """Retrieve records from an opaque table.
310 Parameters
311 ----------
312 tableName : `str`
313 Logical name of the opaque table. Must match the name used in a
314 previous call to `registerOpaqueTable`.
315 where
316 Additional keyword arguments are interpreted as equality
317 constraints that restrict the returned rows (combined with AND);
318 keyword arguments are column names and values are the values they
319 must have.
321 Yields
322 ------
323 row : `dict`
324 A dictionary representing a single result row.
325 """
326 yield from self._managers.opaque[tableName].fetch(**where)
328 @transactional
329 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
330 """Remove records from an opaque table.
332 Parameters
333 ----------
334 tableName : `str`
335 Logical name of the opaque table. Must match the name used in a
336 previous call to `registerOpaqueTable`.
337 where
338 Additional keyword arguments are interpreted as equality
339 constraints that restrict the deleted rows (combined with AND);
340 keyword arguments are column names and values are the values they
341 must have.
342 """
343 self._managers.opaque[tableName].delete(where.keys(), where)
345 def registerCollection(
346 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
347 ) -> bool:
348 # Docstring inherited from lsst.daf.butler.registry.Registry
349 _, registered = self._managers.collections.register(name, type, doc=doc)
350 return registered
352 def getCollectionType(self, name: str) -> CollectionType:
353 # Docstring inherited from lsst.daf.butler.registry.Registry
354 return self._managers.collections.find(name).type
356 def _get_collection_record(self, name: str) -> CollectionRecord:
357 # Docstring inherited from lsst.daf.butler.registry.Registry
358 return self._managers.collections.find(name)
360 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
361 # Docstring inherited from lsst.daf.butler.registry.Registry
362 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
363 return registered
365 @transactional
366 def removeCollection(self, name: str) -> None:
367 # Docstring inherited from lsst.daf.butler.registry.Registry
368 self._managers.collections.remove(name)
370 def getCollectionChain(self, parent: str) -> CollectionSearch:
371 # Docstring inherited from lsst.daf.butler.registry.Registry
372 record = self._managers.collections.find(parent)
373 if record.type is not CollectionType.CHAINED:
374 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
375 assert isinstance(record, ChainedCollectionRecord)
376 return record.children
378 @transactional
379 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
380 # Docstring inherited from lsst.daf.butler.registry.Registry
381 record = self._managers.collections.find(parent)
382 if record.type is not CollectionType.CHAINED:
383 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
384 assert isinstance(record, ChainedCollectionRecord)
385 children = CollectionSearch.fromExpression(children)
386 if children != record.children or flatten:
387 record.update(self._managers.collections, children, flatten=flatten)
389 def getCollectionParentChains(self, collection: str) -> Set[str]:
390 # Docstring inherited from lsst.daf.butler.registry.Registry
391 return {
392 record.name
393 for record in self._managers.collections.getParentChains(
394 self._managers.collections.find(collection).key
395 )
396 }
398 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
399 # Docstring inherited from lsst.daf.butler.registry.Registry
400 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
402 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
403 # Docstring inherited from lsst.daf.butler.registry.Registry
404 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
406 def getCollectionSummary(self, collection: str) -> CollectionSummary:
407 # Docstring inherited from lsst.daf.butler.registry.Registry
408 record = self._managers.collections.find(collection)
409 return self._managers.datasets.getCollectionSummary(record)
411 def registerDatasetType(self, datasetType: DatasetType) -> bool:
412 # Docstring inherited from lsst.daf.butler.registry.Registry
413 _, inserted = self._managers.datasets.register(datasetType)
414 return inserted
416 def removeDatasetType(self, name: str) -> None:
417 # Docstring inherited from lsst.daf.butler.registry.Registry
418 self._managers.datasets.remove(name)
420 def getDatasetType(self, name: str) -> DatasetType:
421 # Docstring inherited from lsst.daf.butler.registry.Registry
422 return self._managers.datasets[name].datasetType
424 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
425 # Docstring inherited from lsst.daf.butler.registry.Registry
426 return self._managers.datasets.supportsIdGenerationMode(mode)
428 def findDataset(
429 self,
430 datasetType: Union[DatasetType, str],
431 dataId: Optional[DataId] = None,
432 *,
433 collections: Any = None,
434 timespan: Optional[Timespan] = None,
435 **kwargs: Any,
436 ) -> Optional[DatasetRef]:
437 # Docstring inherited from lsst.daf.butler.registry.Registry
438 if isinstance(datasetType, DatasetType):
439 storage = self._managers.datasets[datasetType.name]
440 else:
441 storage = self._managers.datasets[datasetType]
442 dataId = DataCoordinate.standardize(
443 dataId,
444 graph=storage.datasetType.dimensions,
445 universe=self.dimensions,
446 defaults=self.defaults.dataId,
447 **kwargs,
448 )
449 if collections is None:
450 if not self.defaults.collections:
451 raise NoDefaultCollectionError(
452 "No collections provided to findDataset, and no defaults from registry construction."
453 )
454 collections = self.defaults.collections
455 else:
456 collections = CollectionSearch.fromExpression(collections)
457 for collectionRecord in collections.iter(self._managers.collections):
458 if collectionRecord.type is CollectionType.CALIBRATION and (
459 not storage.datasetType.isCalibration() or timespan is None
460 ):
461 continue
462 result = storage.find(collectionRecord, dataId, timespan=timespan)
463 if result is not None:
464 return result
466 return None
468 @transactional
469 def insertDatasets(
470 self,
471 datasetType: Union[DatasetType, str],
472 dataIds: Iterable[DataId],
473 run: Optional[str] = None,
474 expand: bool = True,
475 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
476 ) -> List[DatasetRef]:
477 # Docstring inherited from lsst.daf.butler.registry.Registry
478 if isinstance(datasetType, DatasetType):
479 storage = self._managers.datasets.find(datasetType.name)
480 if storage is None:
481 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
482 else:
483 storage = self._managers.datasets.find(datasetType)
484 if storage is None:
485 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
486 if run is None:
487 if self.defaults.run is None:
488 raise NoDefaultCollectionError(
489 "No run provided to insertDatasets, and no default from registry construction."
490 )
491 run = self.defaults.run
492 runRecord = self._managers.collections.find(run)
493 if runRecord.type is not CollectionType.RUN:
494 raise CollectionTypeError(
495 f"Given collection is of type {runRecord.type.name}; RUN collection required."
496 )
497 assert isinstance(runRecord, RunRecord)
498 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
499 if expand:
500 expandedDataIds = [
501 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
502 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
503 ]
504 else:
505 expandedDataIds = [
506 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
507 ]
508 try:
509 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
510 if self._managers.obscore:
511 self._managers.obscore.add_datasets(refs)
512 except sqlalchemy.exc.IntegrityError as err:
513 raise ConflictingDefinitionError(
514 f"A database constraint failure was triggered by inserting "
515 f"one or more datasets of type {storage.datasetType} into "
516 f"collection '{run}'. "
517 f"This probably means a dataset with the same data ID "
518 f"and dataset type already exists, but it may also mean a "
519 f"dimension row is missing."
520 ) from err
521 return refs
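# An illustrative call for reference; the dataset type name, data ID keys,
# and run name are placeholders:
#
#     refs = registry.insertDatasets(
#         "raw",
#         dataIds=[{"instrument": "DummyCam", "exposure": 42, "detector": 1}],
#         run="DummyCam/raw/all",
#     )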
523 @transactional
524 def _importDatasets(
525 self,
526 datasets: Iterable[DatasetRef],
527 expand: bool = True,
528 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
529 reuseIds: bool = False,
530 ) -> List[DatasetRef]:
531 # Docstring inherited from lsst.daf.butler.registry.Registry
532 datasets = list(datasets)
533 if not datasets:
534 # nothing to do
535 return []
537 # find dataset type
538 datasetTypes = set(dataset.datasetType for dataset in datasets)
539 if len(datasetTypes) != 1:
540 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
541 datasetType = datasetTypes.pop()
543 # get storage handler for this dataset type
544 storage = self._managers.datasets.find(datasetType.name)
545 if storage is None:
546 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
548 # find run name
549 runs = set(dataset.run for dataset in datasets)
550 if len(runs) != 1:
551 raise ValueError(f"Multiple run names in input datasets: {runs}")
552 run = runs.pop()
553 if run is None:
554 if self.defaults.run is None:
555 raise NoDefaultCollectionError(
556 "No run provided to ingestDatasets, and no default from registry construction."
557 )
558 run = self.defaults.run
560 runRecord = self._managers.collections.find(run)
561 if runRecord.type is not CollectionType.RUN:
562 raise CollectionTypeError(
563 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
564 " RUN collection required."
565 )
566 assert isinstance(runRecord, RunRecord)
568 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
569 if expand:
570 expandedDatasets = [
571 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
572 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
573 ]
574 else:
575 expandedDatasets = [
576 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
577 for dataset in datasets
578 ]
580 try:
581 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
582 if self._managers.obscore:
583 self._managers.obscore.add_datasets(refs)
584 except sqlalchemy.exc.IntegrityError as err:
585 raise ConflictingDefinitionError(
586 f"A database constraint failure was triggered by inserting "
587 f"one or more datasets of type {storage.datasetType} into "
588 f"collection '{run}'. "
589 f"This probably means a dataset with the same data ID "
590 f"and dataset type already exists, but it may also mean a "
591 f"dimension row is missing."
592 ) from err
593 return refs
595 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
596 # Docstring inherited from lsst.daf.butler.registry.Registry
597 return self._managers.datasets.getDatasetRef(id)
599 @transactional
600 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
601 # Docstring inherited from lsst.daf.butler.registry.Registry
602 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
603 for datasetType, refsForType in progress.iter_item_chunks(
604 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
605 ):
606 storage = self._managers.datasets[datasetType.name]
607 try:
608 storage.delete(refsForType)
609 except sqlalchemy.exc.IntegrityError as err:
610 raise OrphanedRecordError(
611 "One or more datasets is still present in one or more Datastores."
612 ) from err
614 @transactional
615 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
616 # Docstring inherited from lsst.daf.butler.registry.Registry
617 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
618 collectionRecord = self._managers.collections.find(collection)
619 if collectionRecord.type is not CollectionType.TAGGED:
620 raise CollectionTypeError(
621 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
622 )
623 for datasetType, refsForType in progress.iter_item_chunks(
624 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
625 ):
626 storage = self._managers.datasets[datasetType.name]
627 try:
628 storage.associate(collectionRecord, refsForType)
629 if self._managers.obscore:
630 # If a TAGGED collection is being monitored by ObsCore
631 # manager then we may need to save the dataset.
632 self._managers.obscore.associate(refsForType, collectionRecord)
633 except sqlalchemy.exc.IntegrityError as err:
634 raise ConflictingDefinitionError(
635 f"Constraint violation while associating dataset of type {datasetType.name} with "
636 f"collection {collection}. This probably means that one or more datasets with the same "
637 f"dataset type and data ID already exist in the collection, but it may also indicate "
638 f"that the datasets do not exist."
639 ) from err
641 @transactional
642 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
643 # Docstring inherited from lsst.daf.butler.registry.Registry
644 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
645 collectionRecord = self._managers.collections.find(collection)
646 if collectionRecord.type is not CollectionType.TAGGED:
647 raise CollectionTypeError(
648 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
649 )
650 for datasetType, refsForType in progress.iter_item_chunks(
651 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
652 ):
653 storage = self._managers.datasets[datasetType.name]
654 storage.disassociate(collectionRecord, refsForType)
655 if self._managers.obscore:
656 self._managers.obscore.disassociate(refsForType, collectionRecord)
658 @transactional
659 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
660 # Docstring inherited from lsst.daf.butler.registry.Registry
661 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
662 collectionRecord = self._managers.collections.find(collection)
663 for datasetType, refsForType in progress.iter_item_chunks(
664 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
665 ):
666 storage = self._managers.datasets[datasetType.name]
667 storage.certify(collectionRecord, refsForType, timespan)
669 @transactional
670 def decertify(
671 self,
672 collection: str,
673 datasetType: Union[str, DatasetType],
674 timespan: Timespan,
675 *,
676 dataIds: Optional[Iterable[DataId]] = None,
677 ) -> None:
678 # Docstring inherited from lsst.daf.butler.registry.Registry
679 collectionRecord = self._managers.collections.find(collection)
680 if isinstance(datasetType, str):
681 storage = self._managers.datasets[datasetType]
682 else:
683 storage = self._managers.datasets[datasetType.name]
684 standardizedDataIds = None
685 if dataIds is not None:
686 standardizedDataIds = [
687 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
688 ]
689 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
691 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
692 """Return an object that allows a new `Datastore` instance to
693 communicate with this `Registry`.
695 Returns
696 -------
697 manager : `DatastoreRegistryBridgeManager`
698 Object that mediates communication between this `Registry` and its
699 associated datastores.
700 """
701 return self._managers.datastores
703 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
704 # Docstring inherited from lsst.daf.butler.registry.Registry
705 return self._managers.datastores.findDatastores(ref)
707 def expandDataId(
708 self,
709 dataId: Optional[DataId] = None,
710 *,
711 graph: Optional[DimensionGraph] = None,
712 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
713 withDefaults: bool = True,
714 **kwargs: Any,
715 ) -> DataCoordinate:
716 # Docstring inherited from lsst.daf.butler.registry.Registry
717 if not withDefaults:
718 defaults = None
719 else:
720 defaults = self.defaults.dataId
721 try:
722 standardized = DataCoordinate.standardize(
723 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
724 )
725 except KeyError as exc:
726 # This means either a kwarg has an unrecognized name or a required
727 # dimension is missing.
728 raise DimensionNameError(str(exc)) from exc
729 if standardized.hasRecords():
730 return standardized
731 if records is None:
732 records = {}
733 elif isinstance(records, NamedKeyMapping):
734 records = records.byName()
735 else:
736 records = dict(records)
737 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
738 records.update(dataId.records.byName())
739 keys = standardized.byName()
740 for element in standardized.graph.primaryKeyTraversalOrder:
741 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
742 if record is ...:
743 if isinstance(element, Dimension) and keys.get(element.name) is None:
744 if element in standardized.graph.required:
745 raise DimensionNameError(
746 f"No value or null value for required dimension {element.name}."
747 )
748 keys[element.name] = None
749 record = None
750 else:
751 storage = self._managers.dimensions[element]
752 dataIdSet = DataCoordinateIterable.fromScalar(
753 DataCoordinate.standardize(keys, graph=element.graph)
754 )
755 fetched = tuple(storage.fetch(dataIdSet))
756 try:
757 (record,) = fetched
758 except ValueError:
759 record = None
760 records[element.name] = record
761 if record is not None:
762 for d in element.implied:
763 value = getattr(record, d.name)
764 if keys.setdefault(d.name, value) != value:
765 raise InconsistentDataIdError(
766 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
767 f"but {element.name} implies {d.name}={value!r}."
768 )
769 else:
770 if element in standardized.graph.required:
771 raise DataIdValueError(
772 f"Could not fetch record for required dimension {element.name} via keys {keys}."
773 )
774 if element.alwaysJoin:
775 raise InconsistentDataIdError(
776 f"Could not fetch record for element {element.name} via keys {keys}, ",
777 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
778 "related.",
779 )
780 for d in element.implied:
781 keys.setdefault(d.name, None)
782 records.setdefault(d.name, None)
783 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
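# An illustrative call for reference; the dimension names and values are
# placeholders:
#
#     data_id = registry.expandDataId(instrument="DummyCam", exposure=42, detector=1)
#     assert data_id.hasRecords()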
785 def insertDimensionData(
786 self,
787 element: Union[DimensionElement, str],
788 *data: Union[Mapping[str, Any], DimensionRecord],
789 conform: bool = True,
790 replace: bool = False,
791 skip_existing: bool = False,
792 ) -> None:
793 # Docstring inherited from lsst.daf.butler.registry.Registry
794 if conform:
795 if isinstance(element, str):
796 element = self.dimensions[element]
797 records = [
798 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
799 ]
800 else:
801 # Ignore typing since caller said to trust them with conform=False.
802 records = data # type: ignore
803 storage = self._managers.dimensions[element] # type: ignore
804 storage.insert(*records, replace=replace, skip_existing=skip_existing)
806 def syncDimensionData(
807 self,
808 element: Union[DimensionElement, str],
809 row: Union[Mapping[str, Any], DimensionRecord],
810 conform: bool = True,
811 update: bool = False,
812 ) -> Union[bool, Dict[str, Any]]:
813 # Docstring inherited from lsst.daf.butler.registry.Registry
814 if conform:
815 if isinstance(element, str):
816 element = self.dimensions[element]
817 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
818 else:
819 # Ignore typing since caller said to trust them with conform=False.
820 record = row # type: ignore
821 storage = self._managers.dimensions[element] # type: ignore
822 return storage.sync(record, update=update)
824 def queryDatasetTypes(
825 self,
826 expression: Any = ...,
827 *,
828 components: Optional[bool] = None,
829 missing: Optional[List[str]] = None,
830 ) -> Iterator[DatasetType]:
831 # Docstring inherited from lsst.daf.butler.registry.Registry
832 try:
833 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
834 except TypeError as exc:
835 raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
836 unknownComponentsMessage = (
837 "Could not find definition for storage class %s for dataset type %r;"
838 " if it has components they will not be included in dataset type query results."
839 )
840 if wildcard is Ellipsis:
841 for datasetType in self._managers.datasets:
842 # The dataset type can no longer be a component
843 yield datasetType
844 if components:
845 # Automatically create the component dataset types
846 try:
847 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
848 except KeyError as err:
849 _LOG.warning(unknownComponentsMessage, err, datasetType.name)
850 else:
851 yield from componentsForDatasetType
852 return
853 done: Set[str] = set()
854 for name in wildcard.strings:
855 storage = self._managers.datasets.find(name)
856 done.add(name)
857 if storage is None:
858 if missing is not None:
859 missing.append(name)
860 else:
861 yield storage.datasetType
862 if wildcard.patterns:
863 # If components (the argument) is None, we'll save component
864 # dataset types that we might want to match, but only if their
865 # parents didn't get included.
866 componentsForLater = []
867 for registeredDatasetType in self._managers.datasets:
868 # Components are not stored in registry so expand them here
869 allDatasetTypes = [registeredDatasetType]
870 if components is not False:
871 # Only check for the components if we are being asked
872 # for components or components is None.
873 try:
874 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
875 except KeyError as err:
876 _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
877 for datasetType in allDatasetTypes:
878 if datasetType.name in done:
879 continue
880 parentName, componentName = datasetType.nameAndComponent()
881 if componentName is not None and not components:
882 if components is None and parentName not in done:
883 componentsForLater.append(datasetType)
884 continue
885 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
886 done.add(datasetType.name)
887 yield datasetType
888 # Go back and try to match saved components.
889 for datasetType in componentsForLater:
890 parentName, _ = datasetType.nameAndComponent()
891 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
892 yield datasetType
894 def queryCollections(
895 self,
896 expression: Any = ...,
897 datasetType: Optional[DatasetType] = None,
898 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
899 flattenChains: bool = False,
900 includeChains: Optional[bool] = None,
901 ) -> Iterator[str]:
902 # Docstring inherited from lsst.daf.butler.registry.Registry
904 # Right now the datasetType argument is completely ignored, but that
905 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
906 # ticket will take care of that.
907 try:
908 query = CollectionQuery.fromExpression(expression)
909 except TypeError as exc:
910 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
911 collectionTypes = ensure_iterable(collectionTypes)
912 for record in query.iter(
913 self._managers.collections,
914 collectionTypes=frozenset(collectionTypes),
915 flattenChains=flattenChains,
916 includeChains=includeChains,
917 ):
918 yield record.name
920 def _makeQueryBuilder(
921 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
922 ) -> queries.QueryBuilder:
923 """Return a `QueryBuilder` instance capable of constructing and
924 managing more complex queries than those obtainable via `Registry`
925 interfaces.
927 This is an advanced interface; downstream code should prefer
928 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
929 are sufficient.
931 Parameters
932 ----------
933 summary : `queries.QuerySummary`
934 Object describing and categorizing the full set of dimensions that
935 will be included in the query.
936 doomed_by : `Iterable` of `str`, optional
937 A list of diagnostic messages that indicate why the query is going
938 to yield no results and should not even be executed. If an empty
939 container (default) the query will be executed unless other code
940 determines that it is doomed.
942 Returns
943 -------
944 builder : `queries.QueryBuilder`
945 Object that can be used to construct and perform advanced queries.
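
Examples
--------
A minimal sketch mirroring how `queryDatasets` uses this method; the
dimension names are placeholders::

    summary = queries.QuerySummary(
        requested=DimensionGraph(self.dimensions, names={"instrument", "visit"})
    )
    builder = self._makeQueryBuilder(summary)
    query = builder.finish()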
946 """
947 return queries.QueryBuilder(
948 summary,
949 backend=queries.SqlQueryBackend(self._db, self._managers),
950 doomed_by=doomed_by,
951 )
953 def _standardize_query_dataset_args(
954 self,
955 datasets: Any,
956 collections: Any,
957 components: bool | None,
958 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
959 *,
960 doomed_by: list[str],
961 ) -> tuple[defaultdict[DatasetType, list[str | None]], CollectionQuery | CollectionSearch | None]:
962 """Preprocess dataset arguments passed to query* methods.
964 Parameters
965 ----------
966 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
967 Expression identifying dataset types. See `queryDatasetTypes` for
968 details.
969 collections : `str`, `re.Pattern`, or iterable of these
970 Expression identifying collections to be searched. See
971 `queryCollections` for details.
972 components : `bool`, optional
973 If `True`, apply all expression patterns to component dataset type
974 names as well. If `False`, never apply patterns to components.
975 If `None` (default), apply patterns to components only if their
976 parent datasets were not matched by the expression.
977 Fully-specified component datasets (`str` or `DatasetType`
978 instances) are always included.
979 mode : `str`, optional
980 The way in which datasets are being used in this query; one of:
982 - "find_first": this is a query for the first dataset in an
983 ordered list of collections. Prohibits collection wildcards,
984 but permits dataset type wildcards.
986 - "find_all": this is a query for all datasets in all matched
987 collections. Permits collection and dataset type wildcards.
989 - "constrain": this is a query for something other than datasets,
990 with results constrained by dataset existence. Permits
991 collection wildcards and prohibits ``...`` as a dataset type
992 wildcard.
993 doomed_by : `list` [ `str` ]
994 List to append messages indicating why the query is doomed to
995 yield no results.
997 Returns
998 -------
999 composition : `defaultdict` [ `DatasetType`, `list` [ `str` ] ]
1000 Dictionary mapping parent dataset type to `list` of components
1001 matched for that dataset type (or `None` for the parent itself).
1002 collections : `CollectionSearch` or `CollectionQuery`
1003 Processed collection expression.
1004 """
1005 composition: defaultdict[DatasetType, list[str | None]] = defaultdict(list)
1006 if datasets is not None:
1007 if not collections:
1008 if not self.defaults.collections:
1009 raise NoDefaultCollectionError("No collections, and no registry default collections.")
1010 collections = self.defaults.collections
1011 elif mode == "find_first":
1012 collections = CollectionSearch.fromExpression(collections)
1013 else:
1014 collections = CollectionQuery.fromExpression(collections)
1015 missing: list[str] = []
1016 if mode == "constrain" and datasets is Ellipsis:
1017 raise TypeError("Cannot pass the universal wildcard '...' for dataset types in this context.")
1018 for dataset_type in self.queryDatasetTypes(datasets, components=components, missing=missing):
1019 if dataset_type.isComponent():
1020 composition[dataset_type.makeCompositeDatasetType()].append(dataset_type.component())
1021 else:
1022 composition[dataset_type].append(None)
1023 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
1024 elif collections:
1025 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1026 return composition, collections
1028 def queryDatasets(
1029 self,
1030 datasetType: Any,
1031 *,
1032 collections: Any = None,
1033 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1034 dataId: Optional[DataId] = None,
1035 where: Optional[str] = None,
1036 findFirst: bool = False,
1037 components: Optional[bool] = None,
1038 bind: Optional[Mapping[str, Any]] = None,
1039 check: bool = True,
1040 **kwargs: Any,
1041 ) -> queries.DatasetQueryResults:
1042 # Docstring inherited from lsst.daf.butler.registry.Registry
1043 doomed_by: list[str] = []
1044 data_id = self.expandDataId(dataId, **kwargs)
1045 dataset_composition, collections = self._standardize_query_dataset_args(
1046 datasetType,
1047 collections,
1048 components,
1049 mode="find_first" if findFirst else "find_all",
1050 doomed_by=doomed_by,
1051 )
1052 parent_results: list[queries.ParentDatasetQueryResults] = []
1053 for parent_dataset_type, components_for_parent in dataset_composition.items():
1054 # The full set of dimensions in the query is the combination of
1055 # those needed for the DatasetType and those explicitly requested,
1056 # if any.
1057 dimension_names = set(parent_dataset_type.dimensions.names)
1058 if dimensions is not None:
1059 dimension_names.update(self.dimensions.extract(dimensions).names)
1060 # Construct the summary structure needed to construct a
1061 # QueryBuilder.
1062 summary = queries.QuerySummary(
1063 requested=DimensionGraph(self.dimensions, names=dimension_names),
1064 dataId=data_id,
1065 expression=where,
1066 bind=bind,
1067 defaults=self.defaults.dataId,
1068 check=check,
1069 datasets=[parent_dataset_type],
1070 )
1071 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1072 # Add the dataset subquery to the query, telling the QueryBuilder
1073 # to include the rank of the selected collection in the results
1074 # only if we need to findFirst. Note that if any of the
1075 # collections are actually wildcard expressions, and
1076 # findFirst=True, this will raise TypeError for us.
1077 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
1078 query = builder.finish()
1079 parent_results.append(
1080 queries.ParentDatasetQueryResults(
1081 self._db, query, datasetType=parent_dataset_type, components=components_for_parent
1082 )
1083 )
1084 if not parent_results:
1085 doomed_by.extend(
1086 f"No registered dataset type matching {t!r} found, so no matching datasets can "
1087 "exist in any collection."
1088 for t in ensure_iterable(datasetType)
1089 )
1090 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
1091 elif len(parent_results) == 1:
1092 return parent_results[0]
1093 else:
1094 return queries.ChainedDatasetQueryResults(parent_results)
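# An illustrative call for reference; the dataset type, collection, and
# ``where`` expression are placeholders:
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["DummyCam/runs/example"],
#         where="visit = 42 AND detector = 1",
#         findFirst=True,
#     )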
1096 def queryDataIds(
1097 self,
1098 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1099 *,
1100 dataId: Optional[DataId] = None,
1101 datasets: Any = None,
1102 collections: Any = None,
1103 where: Optional[str] = None,
1104 components: Optional[bool] = None,
1105 bind: Optional[Mapping[str, Any]] = None,
1106 check: bool = True,
1107 **kwargs: Any,
1108 ) -> queries.DataCoordinateQueryResults:
1109 # Docstring inherited from lsst.daf.butler.registry.Registry
1110 dimensions = ensure_iterable(dimensions)
1111 requestedDimensions = self.dimensions.extract(dimensions)
1112 doomed_by: list[str] = []
1113 data_id = self.expandDataId(dataId, **kwargs)
1114 dataset_composition, collections = self._standardize_query_dataset_args(
1115 datasets, collections, components, doomed_by=doomed_by
1116 )
1118 def query_factory(
1119 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1120 ) -> queries.Query:
1121 """Construct the Query object that generates query results."""
1122 summary = queries.QuerySummary(
1123 requested=requestedDimensions,
1124 dataId=data_id,
1125 expression=where,
1126 bind=bind,
1127 defaults=self.defaults.dataId,
1128 check=check,
1129 datasets=dataset_composition.keys(),
1130 order_by=order_by,
1131 limit=limit,
1132 )
1133 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1134 for datasetType in dataset_composition:
1135 builder.joinDataset(datasetType, collections, isResult=False)
1136 return builder.finish()
1138 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
1140 def queryDimensionRecords(
1141 self,
1142 element: Union[DimensionElement, str],
1143 *,
1144 dataId: Optional[DataId] = None,
1145 datasets: Any = None,
1146 collections: Any = None,
1147 where: Optional[str] = None,
1148 components: Optional[bool] = None,
1149 bind: Optional[Mapping[str, Any]] = None,
1150 check: bool = True,
1151 **kwargs: Any,
1152 ) -> queries.DimensionRecordQueryResults:
1153 # Docstring inherited from lsst.daf.butler.registry.Registry
1154 if not isinstance(element, DimensionElement):
1155 try:
1156 element = self.dimensions[element]
1157 except KeyError as e:
1158 raise DimensionNameError(
1159 f"No such dimension '{element}', available dimensions: "
1160 + str(self.dimensions.getStaticElements())
1161 ) from e
1162 dataIds = self.queryDataIds(
1163 element.graph,
1164 dataId=dataId,
1165 datasets=datasets,
1166 collections=collections,
1167 where=where,
1168 components=components,
1169 bind=bind,
1170 check=check,
1171 **kwargs,
1172 )
1173 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1175 def queryDatasetAssociations(
1176 self,
1177 datasetType: Union[str, DatasetType],
1178 collections: Any = ...,
1179 *,
1180 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1181 flattenChains: bool = False,
1182 ) -> Iterator[DatasetAssociation]:
1183 # Docstring inherited from lsst.daf.butler.registry.Registry
1184 if collections is None:
1185 if not self.defaults.collections:
1186 raise NoDefaultCollectionError(
1187 "No collections provided to findDataset, and no defaults from registry construction."
1188 )
1189 collections = self.defaults.collections
1190 else:
1191 collections = CollectionQuery.fromExpression(collections)
1192 TimespanReprClass = self._db.getTimespanRepresentation()
1193 if isinstance(datasetType, str):
1194 storage = self._managers.datasets[datasetType]
1195 else:
1196 storage = self._managers.datasets[datasetType.name]
1197 for collectionRecord in collections.iter(
1198 self._managers.collections,
1199 collectionTypes=frozenset(collectionTypes),
1200 flattenChains=flattenChains,
1201 ):
1202 query = storage.select(collectionRecord)
1203 for row in self._db.query(query).mappings():
1204 dataId = DataCoordinate.fromRequiredValues(
1205 storage.datasetType.dimensions,
1206 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1207 )
1208 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1209 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1210 if collectionRecord.type is CollectionType.CALIBRATION:
1211 timespan = TimespanReprClass.extract(row)
1212 else:
1213 timespan = None
1214 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1216 storageClasses: StorageClassFactory
1217 """All storage classes known to the registry (`StorageClassFactory`).
1218 """