Coverage for python/lsst/daf/butler/registries/sql.py: 13%
485 statements
coverage.py v7.5.0, created at 2024-04-24 23:50 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
31import sqlalchemy
32from lsst.resources import ResourcePathExpression
33from lsst.utils.iteration import ensure_iterable
35from ..core import (
36 Config,
37 DataCoordinate,
38 DataCoordinateIterable,
39 DataId,
40 DatasetAssociation,
41 DatasetId,
42 DatasetRef,
43 DatasetType,
44 Dimension,
45 DimensionConfig,
46 DimensionElement,
47 DimensionGraph,
48 DimensionRecord,
49 DimensionUniverse,
50 NamedKeyMapping,
51 NameLookupMapping,
52 Progress,
53 StorageClassFactory,
54 Timespan,
55 ddl,
56)
57from ..core.utils import transactional
58from ..registry import (
59 ArgumentError,
60 CollectionExpressionError,
61 CollectionSearch,
62 CollectionType,
63 CollectionTypeError,
64 ConflictingDefinitionError,
65 DataIdValueError,
66 DatasetTypeError,
67 DatasetTypeExpressionError,
68 DimensionNameError,
69 InconsistentDataIdError,
70 NoDefaultCollectionError,
71 OrphanedRecordError,
72 Registry,
73 RegistryConfig,
74 RegistryDefaults,
75 queries,
76)
77from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
78from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
79from ..registry.queries import Query
80from ..registry.summaries import CollectionSummary
81from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
83if TYPE_CHECKING:  # coverage: 83 ↛ 84, line 83 didn't jump to line 84 because the condition on line 83 was never true
84 from .._butlerConfig import ButlerConfig
85 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
88_LOG = logging.getLogger(__name__)
91class SqlRegistry(Registry):
92 """Registry implementation based on SQLAlchemy.
94 Parameters
95 ----------
96 database : `Database`
97 Database instance to store Registry.
98 defaults : `RegistryDefaults`
99 Default collection search path and/or output `~CollectionType.RUN`
100 collection.
101 managers : `RegistryManagerInstances`
102 All the managers required for this registry.
103 """
105 defaultConfigFile: Optional[str] = None
106 """Path to configuration defaults. Accessed within the ``configs`` resource
107 or relative to a search path. Can be None if no defaults specified.
108 """
110 @classmethod
111 def createFromConfig(
112 cls,
113 config: Optional[Union[RegistryConfig, str]] = None,
114 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
115 butlerRoot: Optional[ResourcePathExpression] = None,
116 ) -> Registry:
117 """Create registry database and return `SqlRegistry` instance.
119 This method initializes database contents; the database must be empty
120 prior to calling this method.
122 Parameters
123 ----------
124 config : `RegistryConfig` or `str`, optional
125 Registry configuration; if missing, the default configuration will
126 be loaded from ``registry.yaml``.
127 dimensionConfig : `DimensionConfig` or `str`, optional
128 Dimensions configuration; if missing, the default configuration
129 will be loaded from ``dimensions.yaml``.
130 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
131 Path to the repository root this `SqlRegistry` will manage.
133 Returns
134 -------
135 registry : `SqlRegistry`
136 A new `SqlRegistry` instance.
137 """
138 config = cls.forceRegistryConfig(config)
139 config.replaceRoot(butlerRoot)
141 if isinstance(dimensionConfig, str):
142 dimensionConfig = DimensionConfig(dimensionConfig)
143 elif dimensionConfig is None:
144 dimensionConfig = DimensionConfig()
145 elif not isinstance(dimensionConfig, DimensionConfig):
146 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
148 DatabaseClass = config.getDatabaseClass()
149 database = DatabaseClass.fromUri(
150 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
151 )
152 managerTypes = RegistryManagerTypes.fromConfig(config)
153 managers = managerTypes.makeRepo(database, dimensionConfig)
154 return cls(database, RegistryDefaults(), managers)
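    # Illustrative sketch (not part of the implementation): creating a brand-new,
    # empty registry database from a configuration file. The paths below are
    # hypothetical.
    #
    #     registry = SqlRegistry.createFromConfig(
    #         "registry.yaml", butlerRoot="/repo/example"
    #     )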
156 @classmethod
157 def fromConfig(
158 cls,
159 config: Union[ButlerConfig, RegistryConfig, Config, str],
160 butlerRoot: Optional[ResourcePathExpression] = None,
161 writeable: bool = True,
162 defaults: Optional[RegistryDefaults] = None,
163 ) -> Registry:
164 """Create `Registry` subclass instance from `config`.
166 Registry database must be initialized prior to calling this method.
168 Parameters
169 ----------
170 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
171 Registry configuration.
172 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
173 Path to the repository root this `Registry` will manage.
174 writeable : `bool`, optional
175 If `True` (default) create a read-write connection to the database.
176 defaults : `RegistryDefaults`, optional
177 Default collection search path and/or output `~CollectionType.RUN`
178 collection.
180 Returns
181 -------
182 registry : `SqlRegistry` (subclass)
183 A new `SqlRegistry` subclass instance.
184 """
185 config = cls.forceRegistryConfig(config)
186 config.replaceRoot(butlerRoot)
187 DatabaseClass = config.getDatabaseClass()
188 database = DatabaseClass.fromUri(
189 str(config.connectionString),
190 origin=config.get("origin", 0),
191 namespace=config.get("namespace"),
192 writeable=writeable,
193 )
194 managerTypes = RegistryManagerTypes.fromConfig(config)
195 with database.session():
196 managers = managerTypes.loadRepo(database)
197 if defaults is None:
198 defaults = RegistryDefaults()
199 return cls(database, defaults, managers)
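    # Illustrative sketch: opening the registry of an existing repository
    # read-only, with a default collection search path. The config path and
    # collection name are hypothetical.
    #
    #     defaults = RegistryDefaults(collections=["HSC/defaults"])
    #     registry = SqlRegistry.fromConfig(
    #         "/repo/example/butler.yaml", writeable=False, defaults=defaults
    #     )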
201 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
202 self._db = database
203 self._managers = managers
204 self.storageClasses = StorageClassFactory()
205 # Intentionally invoke property setter to initialize defaults. This
206 # can only be done after most of the rest of Registry has already been
207 # initialized, and must be done before the property getter is used.
208 self.defaults = defaults
210 def __str__(self) -> str:
211 return str(self._db)
213 def __repr__(self) -> str:
214 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
216 def isWriteable(self) -> bool:
217 # Docstring inherited from lsst.daf.butler.registry.Registry
218 return self._db.isWriteable()
220 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
221 # Docstring inherited from lsst.daf.butler.registry.Registry
222 if defaults is None:
223 # No need to copy, because `RegistryDefaults` is immutable; we
224 # effectively copy on write.
225 defaults = self.defaults
226 return type(self)(self._db, defaults, self._managers)
228 @property
229 def dimensions(self) -> DimensionUniverse:
230 # Docstring inherited from lsst.daf.butler.registry.Registry
231 return self._managers.dimensions.universe
233 def refresh(self) -> None:
234 # Docstring inherited from lsst.daf.butler.registry.Registry
235 with self._db.transaction():
236 self._managers.refresh()
238 @contextlib.contextmanager
239 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
240 # Docstring inherited from lsst.daf.butler.registry.Registry
241 try:
242 with self._db.transaction(savepoint=savepoint):
243 yield
244 except BaseException:
245 # TODO: this clears the caches sometimes when we wouldn't actually
246 # need to. Can we avoid that?
247 self._managers.dimensions.clearCaches()
248 raise
250 def resetConnectionPool(self) -> None:
251 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
253 This operation is useful when using the registry with fork-based
254 multiprocessing. To use the registry across a fork boundary, make sure
255 that there are no currently active connections (no session or
256 transaction in progress) and that the connection pool has been reset
257 using this method. The child process should call this method
258 immediately after the fork.
259 """
260 self._db._engine.dispose()
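    # Illustrative sketch of the fork-safe pattern described above, assuming a
    # registry was constructed in the parent process with no open transaction:
    #
    #     import os
    #
    #     pid = os.fork()
    #     if pid == 0:
    #         # Child: discard inherited connections before touching the registry.
    #         registry.resetConnectionPool()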
262 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
263 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
264 other data repository client.
266 Opaque table records can be added via `insertOpaqueData`, retrieved via
267 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
269 Parameters
270 ----------
271 tableName : `str`
272 Logical name of the opaque table. This may differ from the
273 actual name used in the database by a prefix and/or suffix.
274 spec : `ddl.TableSpec`
275 Specification for the table to be added.
276 """
277 self._managers.opaque.register(tableName, spec)
279 @transactional
280 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
281 """Insert records into an opaque table.
283 Parameters
284 ----------
285 tableName : `str`
286 Logical name of the opaque table. Must match the name used in a
287 previous call to `registerOpaqueTable`.
288 data
289 Each additional positional argument is a dictionary that represents
290 a single row to be added.
291 """
292 self._managers.opaque[tableName].insert(*data)
294 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
295 """Retrieve records from an opaque table.
297 Parameters
298 ----------
299 tableName : `str`
300 Logical name of the opaque table. Must match the name used in a
301 previous call to `registerOpaqueTable`.
302 where
303 Additional keyword arguments are interpreted as equality
304 constraints that restrict the returned rows (combined with AND);
305 keyword arguments are column names and values are the values they
306 must have.
308 Yields
309 ------
310 row : `dict`
311 A dictionary representing a single result row.
312 """
313 yield from self._managers.opaque[tableName].fetch(**where)
315 @transactional
316 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
317 """Remove records from an opaque table.
319 Parameters
320 ----------
321 tableName : `str`
322 Logical name of the opaque table. Must match the name used in a
323 previous call to `registerOpaqueTable`.
324 where
325 Additional keyword arguments are interpreted as equality
326 constraints that restrict the deleted rows (combined with AND);
327 keyword arguments are column names and values are the values they
328 must have.
329 """
330 self._managers.opaque[tableName].delete(where.keys(), where)
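    # Illustrative sketch tying the opaque-table methods together. The table
    # name, columns, and exact ddl.FieldSpec arguments are assumptions, not a
    # definitive schema.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("example_datastore_records", spec)
    #     registry.insertOpaqueData("example_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("example_datastore_records", dataset_id=1)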
332 def registerCollection(
333 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
334 ) -> bool:
335 # Docstring inherited from lsst.daf.butler.registry.Registry
336 _, registered = self._managers.collections.register(name, type, doc=doc)
337 return registered
339 def getCollectionType(self, name: str) -> CollectionType:
340 # Docstring inherited from lsst.daf.butler.registry.Registry
341 return self._managers.collections.find(name).type
343 def _get_collection_record(self, name: str) -> CollectionRecord:
344 # Docstring inherited from lsst.daf.butler.registry.Registry
345 return self._managers.collections.find(name)
347 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
348 # Docstring inherited from lsst.daf.butler.registry.Registry
349 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
350 return registered
352 @transactional
353 def removeCollection(self, name: str) -> None:
354 # Docstring inherited from lsst.daf.butler.registry.Registry
355 self._managers.collections.remove(name)
357 def getCollectionChain(self, parent: str) -> CollectionSearch:
358 # Docstring inherited from lsst.daf.butler.registry.Registry
359 record = self._managers.collections.find(parent)
360 if record.type is not CollectionType.CHAINED:
361 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
362 assert isinstance(record, ChainedCollectionRecord)
363 return record.children
365 @transactional
366 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
367 # Docstring inherited from lsst.daf.butler.registry.Registry
368 record = self._managers.collections.find(parent)
369 if record.type is not CollectionType.CHAINED:
370 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
371 assert isinstance(record, ChainedCollectionRecord)
372 children = CollectionSearch.fromExpression(children)
373 if children != record.children or flatten:
374 record.update(self._managers.collections, children, flatten=flatten)
376 def getCollectionParentChains(self, collection: str) -> Set[str]:
377 # Docstring inherited from lsst.daf.butler.registry.Registry
378 return {
379 record.name
380 for record in self._managers.collections.getParentChains(
381 self._managers.collections.find(collection).key
382 )
383 }
385 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
386 # Docstring inherited from lsst.daf.butler.registry.Registry
387 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
389 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
390 # Docstring inherited from lsst.daf.butler.registry.Registry
391 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
393 def getCollectionSummary(self, collection: str) -> CollectionSummary:
394 # Docstring inherited from lsst.daf.butler.registry.Registry
395 record = self._managers.collections.find(collection)
396 return self._managers.datasets.getCollectionSummary(record)
398 def registerDatasetType(self, datasetType: DatasetType) -> bool:
399 # Docstring inherited from lsst.daf.butler.registry.Registry
400 _, inserted = self._managers.datasets.register(datasetType)
401 return inserted
403 def removeDatasetType(self, name: str) -> None:
404 # Docstring inherited from lsst.daf.butler.registry.Registry
405 self._managers.datasets.remove(name)
407 def getDatasetType(self, name: str) -> DatasetType:
408 # Docstring inherited from lsst.daf.butler.registry.Registry
409 return self._managers.datasets[name].datasetType
411 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
412 # Docstring inherited from lsst.daf.butler.registry.Registry
413 return self._managers.datasets.supportsIdGenerationMode(mode)
415 def findDataset(
416 self,
417 datasetType: Union[DatasetType, str],
418 dataId: Optional[DataId] = None,
419 *,
420 collections: Any = None,
421 timespan: Optional[Timespan] = None,
422 **kwargs: Any,
423 ) -> Optional[DatasetRef]:
424 # Docstring inherited from lsst.daf.butler.registry.Registry
425 if isinstance(datasetType, DatasetType):
426 storage = self._managers.datasets[datasetType.name]
427 else:
428 storage = self._managers.datasets[datasetType]
429 dataId = DataCoordinate.standardize(
430 dataId,
431 graph=storage.datasetType.dimensions,
432 universe=self.dimensions,
433 defaults=self.defaults.dataId,
434 **kwargs,
435 )
436 if collections is None:
437 if not self.defaults.collections:
438 raise NoDefaultCollectionError(
439 "No collections provided to findDataset, and no defaults from registry construction."
440 )
441 collections = self.defaults.collections
442 else:
443 collections = CollectionSearch.fromExpression(collections)
444 for collectionRecord in collections.iter(self._managers.collections):
445 if collectionRecord.type is CollectionType.CALIBRATION and (
446 not storage.datasetType.isCalibration() or timespan is None
447 ):
448 continue
449 result = storage.find(collectionRecord, dataId, timespan=timespan)
450 if result is not None:
451 return result
453 return None
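    # Illustrative sketch: resolving a single dataset by dataset type and data
    # ID in an explicit collection search path (all names and values are
    # hypothetical).
    #
    #     ref = registry.findDataset(
    #         "calexp",
    #         instrument="HSC", visit=903334, detector=16,
    #         collections=["HSC/runs/example"],
    #     )
    #     if ref is not None:
    #         print(ref.id, ref.run)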
455 @transactional
456 def insertDatasets(
457 self,
458 datasetType: Union[DatasetType, str],
459 dataIds: Iterable[DataId],
460 run: Optional[str] = None,
461 expand: bool = True,
462 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
463 ) -> List[DatasetRef]:
464 # Docstring inherited from lsst.daf.butler.registry.Registry
465 if isinstance(datasetType, DatasetType):
466 storage = self._managers.datasets.find(datasetType.name)
467 if storage is None:
468 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
469 else:
470 storage = self._managers.datasets.find(datasetType)
471 if storage is None:
472 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
473 if run is None:
474 if self.defaults.run is None:
475 raise NoDefaultCollectionError(
476 "No run provided to insertDatasets, and no default from registry construction."
477 )
478 run = self.defaults.run
479 runRecord = self._managers.collections.find(run)
480 if runRecord.type is not CollectionType.RUN:
481 raise CollectionTypeError(
482 f"Given collection is of type {runRecord.type.name}; RUN collection required."
483 )
484 assert isinstance(runRecord, RunRecord)
485 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
486 if expand:
487 expandedDataIds = [
488 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
489 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
490 ]
491 else:
492 expandedDataIds = [
493 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
494 ]
495 try:
496 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
497 except sqlalchemy.exc.IntegrityError as err:
498 raise ConflictingDefinitionError(
499 f"A database constraint failure was triggered by inserting "
500 f"one or more datasets of type {storage.datasetType} into "
501 f"collection '{run}'. "
502 f"This probably means a dataset with the same data ID "
503 f"and dataset type already exists, but it may also mean a "
504 f"dimension row is missing."
505 ) from err
506 return refs
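    # Illustrative sketch: inserting new datasets of an already-registered
    # dataset type into a RUN collection (names and data ID values are
    # hypothetical).
    #
    #     registry.registerRun("u/example/run")
    #     refs = registry.insertDatasets(
    #         "calexp",
    #         dataIds=[{"instrument": "HSC", "visit": 903334, "detector": 16}],
    #         run="u/example/run",
    #     )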
508 @transactional
509 def _importDatasets(
510 self,
511 datasets: Iterable[DatasetRef],
512 expand: bool = True,
513 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
514 reuseIds: bool = False,
515 ) -> List[DatasetRef]:
516 # Docstring inherited from lsst.daf.butler.registry.Registry
517 datasets = list(datasets)
518 if not datasets:
519 # nothing to do
520 return []
522 # find dataset type
523 datasetTypes = set(dataset.datasetType for dataset in datasets)
524 if len(datasetTypes) != 1:
525 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
526 datasetType = datasetTypes.pop()
528 # get storage handler for this dataset type
529 storage = self._managers.datasets.find(datasetType.name)
530 if storage is None:
531 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
533 # find run name
534 runs = set(dataset.run for dataset in datasets)
535 if len(runs) != 1:
536 raise ValueError(f"Multiple run names in input datasets: {runs}")
537 run = runs.pop()
538 if run is None:
539 if self.defaults.run is None:
540 raise NoDefaultCollectionError(
541 "No run provided to ingestDatasets, and no default from registry construction."
542 )
543 run = self.defaults.run
545 runRecord = self._managers.collections.find(run)
546 if runRecord.type is not CollectionType.RUN:
547 raise CollectionTypeError(
548 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
549 " RUN collection required."
550 )
551 assert isinstance(runRecord, RunRecord)
553 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
554 if expand:
555 expandedDatasets = [
556 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
557 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
558 ]
559 else:
560 expandedDatasets = [
561 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
562 for dataset in datasets
563 ]
565 try:
566 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
567 except sqlalchemy.exc.IntegrityError as err:
568 raise ConflictingDefinitionError(
569 f"A database constraint failure was triggered by inserting "
570 f"one or more datasets of type {storage.datasetType} into "
571 f"collection '{run}'. "
572 f"This probably means a dataset with the same data ID "
573 f"and dataset type already exists, but it may also mean a "
574 f"dimension row is missing."
575 ) from err
576 return refs
578 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
579 # Docstring inherited from lsst.daf.butler.registry.Registry
580 return self._managers.datasets.getDatasetRef(id)
582 @transactional
583 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
584 # Docstring inherited from lsst.daf.butler.registry.Registry
585 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
586 for datasetType, refsForType in progress.iter_item_chunks(
587 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
588 ):
589 storage = self._managers.datasets[datasetType.name]
590 try:
591 storage.delete(refsForType)
592 except sqlalchemy.exc.IntegrityError as err:
593 raise OrphanedRecordError(
594 "One or more datasets is still present in one or more Datastores."
595 ) from err
597 @transactional
598 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
599 # Docstring inherited from lsst.daf.butler.registry.Registry
600 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
601 collectionRecord = self._managers.collections.find(collection)
602 if collectionRecord.type is not CollectionType.TAGGED:
603 raise CollectionTypeError(
604 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
605 )
606 for datasetType, refsForType in progress.iter_item_chunks(
607 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
608 ):
609 storage = self._managers.datasets[datasetType.name]
610 try:
611 storage.associate(collectionRecord, refsForType)
612 except sqlalchemy.exc.IntegrityError as err:
613 raise ConflictingDefinitionError(
614 f"Constraint violation while associating dataset of type {datasetType.name} with "
615 f"collection {collection}. This probably means that one or more datasets with the same "
616 f"dataset type and data ID already exist in the collection, but it may also indicate "
617 f"that the datasets do not exist."
618 ) from err
620 @transactional
621 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
622 # Docstring inherited from lsst.daf.butler.registry.Registry
623 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
624 collectionRecord = self._managers.collections.find(collection)
625 if collectionRecord.type is not CollectionType.TAGGED:
626 raise CollectionTypeError(
627 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
628 )
629 for datasetType, refsForType in progress.iter_item_chunks(
630 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
631 ):
632 storage = self._managers.datasets[datasetType.name]
633 storage.disassociate(collectionRecord, refsForType)
635 @transactional
636 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
637 # Docstring inherited from lsst.daf.butler.registry.Registry
638 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
639 collectionRecord = self._managers.collections.find(collection)
640 for datasetType, refsForType in progress.iter_item_chunks(
641 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
642 ):
643 storage = self._managers.datasets[datasetType.name]
644 storage.certify(collectionRecord, refsForType, timespan)
646 @transactional
647 def decertify(
648 self,
649 collection: str,
650 datasetType: Union[str, DatasetType],
651 timespan: Timespan,
652 *,
653 dataIds: Optional[Iterable[DataId]] = None,
654 ) -> None:
655 # Docstring inherited from lsst.daf.butler.registry.Registry
656 collectionRecord = self._managers.collections.find(collection)
657 if isinstance(datasetType, str):
658 storage = self._managers.datasets[datasetType]
659 else:
660 storage = self._managers.datasets[datasetType.name]
661 standardizedDataIds = None
662 if dataIds is not None:
663 standardizedDataIds = [
664 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
665 ]
666 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
668 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
669 """Return an object that allows a new `Datastore` instance to
670 communicate with this `Registry`.
672 Returns
673 -------
674 manager : `DatastoreRegistryBridgeManager`
675 Object that mediates communication between this `Registry` and its
676 associated datastores.
677 """
678 return self._managers.datastores
680 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
681 # Docstring inherited from lsst.daf.butler.registry.Registry
682 return self._managers.datastores.findDatastores(ref)
684 def expandDataId(
685 self,
686 dataId: Optional[DataId] = None,
687 *,
688 graph: Optional[DimensionGraph] = None,
689 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
690 withDefaults: bool = True,
691 **kwargs: Any,
692 ) -> DataCoordinate:
693 # Docstring inherited from lsst.daf.butler.registry.Registry
694 if not withDefaults:
695 defaults = None
696 else:
697 defaults = self.defaults.dataId
698 try:
699 standardized = DataCoordinate.standardize(
700 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
701 )
702 except KeyError as exc:
703 # This means either kwargs have some odd name or required
704 # dimension is missing.
705 raise DimensionNameError(str(exc)) from exc
706 if standardized.hasRecords():
707 return standardized
708 if records is None:
709 records = {}
710 elif isinstance(records, NamedKeyMapping):
711 records = records.byName()
712 else:
713 records = dict(records)
714 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
715 records.update(dataId.records.byName())
716 keys = standardized.byName()
717 for element in standardized.graph.primaryKeyTraversalOrder:
718 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
719 if record is ...:
720 if isinstance(element, Dimension) and keys.get(element.name) is None:
721 if element in standardized.graph.required:
722 raise DimensionNameError(
723 f"No value or null value for required dimension {element.name}."
724 )
725 keys[element.name] = None
726 record = None
727 else:
728 storage = self._managers.dimensions[element]
729 dataIdSet = DataCoordinateIterable.fromScalar(
730 DataCoordinate.standardize(keys, graph=element.graph)
731 )
732 fetched = tuple(storage.fetch(dataIdSet))
733 try:
734 (record,) = fetched
735 except ValueError:
736 record = None
737 records[element.name] = record
738 if record is not None:
739 for d in element.implied:
740 value = getattr(record, d.name)
741 if keys.setdefault(d.name, value) != value:
742 raise InconsistentDataIdError(
743 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
744 f"but {element.name} implies {d.name}={value!r}."
745 )
746 else:
747 if element in standardized.graph.required:
748 raise DataIdValueError(
749 f"Could not fetch record for required dimension {element.name} via keys {keys}."
750 )
751 if element.alwaysJoin:
752 raise InconsistentDataIdError(
753 f"Could not fetch record for element {element.name} via keys {keys}, ",
754 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
755 "related.",
756 )
757 for d in element.implied:
758 keys.setdefault(d.name, None)
759 records.setdefault(d.name, None)
760 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
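    # Illustrative sketch: expanding a minimal data ID so it carries full
    # dimension records (the instrument and exposure values are hypothetical).
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
    #     assert dataId.hasRecords()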
762 def insertDimensionData(
763 self,
764 element: Union[DimensionElement, str],
765 *data: Union[Mapping[str, Any], DimensionRecord],
766 conform: bool = True,
767 replace: bool = False,
768 skip_existing: bool = False,
769 ) -> None:
770 # Docstring inherited from lsst.daf.butler.registry.Registry
771 if conform:
772 if isinstance(element, str):
773 element = self.dimensions[element]
774 records = [
775 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
776 ]
777 else:
778 # Ignore typing since caller said to trust them with conform=False.
779 records = data # type: ignore
780 storage = self._managers.dimensions[element] # type: ignore
781 storage.insert(*records, replace=replace, skip_existing=skip_existing)
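    # Illustrative sketch: inserting a dimension record from a plain mapping,
    # relying on ``conform=True`` to build the record class. The record fields
    # shown are assumptions; they depend on the dimension configuration.
    #
    #     registry.insertDimensionData(
    #         "instrument",
    #         {"name": "HSC", "visit_max": 999999, "exposure_max": 999999, "detector_max": 200},
    #     )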
783 def syncDimensionData(
784 self,
785 element: Union[DimensionElement, str],
786 row: Union[Mapping[str, Any], DimensionRecord],
787 conform: bool = True,
788 update: bool = False,
789 ) -> Union[bool, Dict[str, Any]]:
790 # Docstring inherited from lsst.daf.butler.registry.Registry
791 if conform:
792 if isinstance(element, str):
793 element = self.dimensions[element]
794 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
795 else:
796 # Ignore typing since caller said to trust them with conform=False.
797 record = row # type: ignore
798 storage = self._managers.dimensions[element] # type: ignore
799 return storage.sync(record, update=update)
801 def queryDatasetTypes(
802 self,
803 expression: Any = ...,
804 *,
805 components: Optional[bool] = None,
806 missing: Optional[List[str]] = None,
807 ) -> Iterator[DatasetType]:
808 # Docstring inherited from lsst.daf.butler.registry.Registry
809 try:
810 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
811 except TypeError as exc:
812 raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
813 unknownComponentsMessage = (
814 "Could not find definition for storage class %s for dataset type %r;"
815 " if it has components they will not be included in dataset type query results."
816 )
817 if wildcard is Ellipsis:
818 for datasetType in self._managers.datasets:
819 # The dataset type can no longer be a component
820 yield datasetType
821 if components:
822 # Automatically create the component dataset types
823 try:
824 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
825 except KeyError as err:
826 _LOG.warning(unknownComponentsMessage, err, datasetType.name)
827 else:
828 yield from componentsForDatasetType
829 return
830 done: Set[str] = set()
831 for name in wildcard.strings:
832 storage = self._managers.datasets.find(name)
833 done.add(name)
834 if storage is None:
835 if missing is not None:
836 missing.append(name)
837 else:
838 yield storage.datasetType
839 if wildcard.patterns:
840 # If components (the argument) is None, we'll save component
841 # datasets that we might want to match, but only if their parents
842 # didn't get included.
843 componentsForLater = []
844 for registeredDatasetType in self._managers.datasets:
845 # Components are not stored in registry so expand them here
846 allDatasetTypes = [registeredDatasetType]
847 if components is not False:
848 # Only check for the components if we are being asked
849 # for components or components is None.
850 try:
851 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
852 except KeyError as err:
853 _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
854 for datasetType in allDatasetTypes:
855 if datasetType.name in done:
856 continue
857 parentName, componentName = datasetType.nameAndComponent()
858 if componentName is not None and not components:
859 if components is None and parentName not in done:
860 componentsForLater.append(datasetType)
861 continue
862 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
863 done.add(datasetType.name)
864 yield datasetType
865 # Go back and try to match saved components.
866 for datasetType in componentsForLater:
867 parentName, _ = datasetType.nameAndComponent()
868 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
869 yield datasetType
871 def queryCollections(
872 self,
873 expression: Any = ...,
874 datasetType: Optional[DatasetType] = None,
875 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
876 flattenChains: bool = False,
877 includeChains: Optional[bool] = None,
878 ) -> Iterator[str]:
879 # Docstring inherited from lsst.daf.butler.registry.Registry
881 # Right now the datasetTypes argument is completely ignored, but that
882 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
883 # ticket will take care of that.
884 try:
885 query = CollectionQuery.fromExpression(expression)
886 except TypeError as exc:
887 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
888 collectionTypes = ensure_iterable(collectionTypes)
889 for record in query.iter(
890 self._managers.collections,
891 collectionTypes=frozenset(collectionTypes),
892 flattenChains=flattenChains,
893 includeChains=includeChains,
894 ):
895 yield record.name
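    # Illustrative sketch: listing RUN collections whose names match a regular
    # expression (the pattern is hypothetical).
    #
    #     import re
    #
    #     runs = list(
    #         registry.queryCollections(
    #             re.compile("u/example/.*"), collectionTypes=CollectionType.RUN
    #         )
    #     )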
897 def _makeQueryBuilder(
898 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
899 ) -> queries.QueryBuilder:
900 """Return a `QueryBuilder` instance capable of constructing and
901 managing more complex queries than those obtainable via `Registry`
902 interfaces.
904 This is an advanced interface; downstream code should prefer
905 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
906 are sufficient.
908 Parameters
909 ----------
910 summary : `queries.QuerySummary`
911 Object describing and categorizing the full set of dimensions that
912 will be included in the query.
913 doomed_by : `Iterable` of `str`, optional
914 A list of diagnostic messages that indicate why the query is going
915 to yield no results and should not even be executed. If an empty
916 container (default) the query will be executed unless other code
917 determines that it is doomed.
919 Returns
920 -------
921 builder : `queries.QueryBuilder`
922 Object that can be used to construct and perform advanced queries.
923 """
924 return queries.QueryBuilder(
925 summary,
926 queries.RegistryManagers(
927 collections=self._managers.collections,
928 dimensions=self._managers.dimensions,
929 datasets=self._managers.datasets,
930 TimespanReprClass=self._db.getTimespanRepresentation(),
931 ),
932 doomed_by=doomed_by,
933 )
935 def queryDatasets(
936 self,
937 datasetType: Any,
938 *,
939 collections: Any = None,
940 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
941 dataId: Optional[DataId] = None,
942 where: Optional[str] = None,
943 findFirst: bool = False,
944 components: Optional[bool] = None,
945 bind: Optional[Mapping[str, Any]] = None,
946 check: bool = True,
947 **kwargs: Any,
948 ) -> queries.DatasetQueryResults:
949 # Docstring inherited from lsst.daf.butler.registry.Registry
951 # Standardize the collections expression.
952 if collections is None:
953 if not self.defaults.collections:
954 raise NoDefaultCollectionError(
955 "No collections provided to findDataset, and no defaults from registry construction."
956 )
957 collections = self.defaults.collections
958 elif findFirst:
959 collections = CollectionSearch.fromExpression(collections)
960 else:
961 collections = CollectionQuery.fromExpression(collections)
962 # Standardize and expand the data ID provided as a constraint.
963 standardizedDataId = self.expandDataId(dataId, **kwargs)
965 # We can only query directly if given a non-component DatasetType
966 # instance. If we were given an expression or str or a component
967 # DatasetType instance, we'll populate this dict, recurse, and return.
968 # If we already have a non-component DatasetType, it will remain None
969 # and we'll run the query directly.
970 composition: Optional[
971 Dict[
972 DatasetType, List[Optional[str]] # parent dataset type # component name, or None for parent
973 ]
974 ] = None
975 if not isinstance(datasetType, DatasetType):
976 # We were given a dataset type expression (which may be as simple
977 # as a str). Loop over all matching datasets, delegating handling
978 # of the `components` argument to queryDatasetTypes, as we populate
979 # the composition dict.
980 composition = defaultdict(list)
981 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
982 parentName, componentName = trueDatasetType.nameAndComponent()
983 if componentName is not None:
984 parentDatasetType = self.getDatasetType(parentName)
985 composition.setdefault(parentDatasetType, []).append(componentName)
986 else:
987 composition.setdefault(trueDatasetType, []).append(None)
988 if not composition:
989 return queries.ChainedDatasetQueryResults(
990 [],
991 doomed_by=[
992 f"No registered dataset type matching {t!r} found, so no matching datasets can "
993 "exist in any collection."
994 for t in ensure_iterable(datasetType)
995 ],
996 )
997 elif datasetType.isComponent():
998 # We were given a true DatasetType instance, but it's a component.
999 # The composition dict will have exactly one item.
1000 parentName, componentName = datasetType.nameAndComponent()
1001 parentDatasetType = self.getDatasetType(parentName)
1002 composition = {parentDatasetType: [componentName]}
1003 if composition is not None:
1004 # We need to recurse. Do that once for each parent dataset type.
1005 chain = []
1006 for parentDatasetType, componentNames in composition.items():
1007 parentResults = self.queryDatasets(
1008 parentDatasetType,
1009 collections=collections,
1010 dimensions=dimensions,
1011 dataId=standardizedDataId,
1012 where=where,
1013 bind=bind,
1014 findFirst=findFirst,
1015 check=check,
1016 )
1017 assert isinstance(
1018 parentResults, queries.ParentDatasetQueryResults
1019 ), "Should always be true if passing in a DatasetType instance, and we are."
1020 chain.append(parentResults.withComponents(componentNames))
1021 return queries.ChainedDatasetQueryResults(chain)
1022 # If we get here, there's no need to recurse (or we are already
1023 # recursing; there can only ever be one level of recursion).
1025 # The full set of dimensions in the query is the combination of those
1026 # needed for the DatasetType and those explicitly requested, if any.
1027 requestedDimensionNames = set(datasetType.dimensions.names)
1028 if dimensions is not None:
1029 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1030 # Construct the summary structure needed to construct a QueryBuilder.
1031 summary = queries.QuerySummary(
1032 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1033 dataId=standardizedDataId,
1034 expression=where,
1035 bind=bind,
1036 defaults=self.defaults.dataId,
1037 check=check,
1038 datasets=[datasetType],
1039 )
1040 builder = self._makeQueryBuilder(summary)
1041 # Add the dataset subquery to the query, telling the QueryBuilder to
1042 # include the rank of the selected collection in the results only if we
1043 # need to findFirst. Note that if any of the collections are
1044 # actually wildcard expressions, and we've asked for deduplication,
1045 # this will raise TypeError for us.
1046 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
1047 query = builder.finish()
1048 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
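    # Illustrative sketch: querying datasets with a user expression and bind
    # parameters, taking only the first match per data ID in collection search
    # order (dataset type, collections, and values are hypothetical).
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/runs/example"],
    #         where="visit = my_visit",
    #         bind={"my_visit": 903334},
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)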
1050 def queryDataIds(
1051 self,
1052 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1053 *,
1054 dataId: Optional[DataId] = None,
1055 datasets: Any = None,
1056 collections: Any = None,
1057 where: Optional[str] = None,
1058 components: Optional[bool] = None,
1059 bind: Optional[Mapping[str, Any]] = None,
1060 check: bool = True,
1061 **kwargs: Any,
1062 ) -> queries.DataCoordinateQueryResults:
1063 # Docstring inherited from lsst.daf.butler.registry.Registry
1064 dimensions = ensure_iterable(dimensions)
1065 standardizedDataId = self.expandDataId(dataId, **kwargs)
1066 standardizedDatasetTypes = set()
1067 requestedDimensions = self.dimensions.extract(dimensions)
1068 missing: List[str] = []
1069 if datasets is not None:
1070 if not collections:
1071 if not self.defaults.collections:
1072 raise NoDefaultCollectionError(
1073 f"Cannot pass 'datasets' (='{datasets}') without 'collections'."
1074 )
1075 collections = self.defaults.collections
1076 else:
1077 # Preprocess collections expression in case the original
1078 # included single-pass iterators (we'll want to use it multiple
1079 # times below).
1080 collections = CollectionQuery.fromExpression(collections)
1081 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
1082 # If any matched dataset type is a component, just operate on
1083 # its parent instead, because Registry doesn't know anything
1084 # about what components exist, and here (unlike queryDatasets)
1085 # we don't care about returning them.
1086 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1087 if componentName is not None:
1088 datasetType = self.getDatasetType(parentDatasetTypeName)
1089 standardizedDatasetTypes.add(datasetType)
1090 elif collections:
1091 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1093 def query_factory(
1094 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1095 ) -> Query:
1096 """Construct the Query object that generates query results."""
1097 summary = queries.QuerySummary(
1098 requested=requestedDimensions,
1099 dataId=standardizedDataId,
1100 expression=where,
1101 bind=bind,
1102 defaults=self.defaults.dataId,
1103 check=check,
1104 datasets=standardizedDatasetTypes,
1105 order_by=order_by,
1106 limit=limit,
1107 )
1108 builder = self._makeQueryBuilder(
1109 summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
1110 )
1111 for datasetType in standardizedDatasetTypes:
1112 builder.joinDataset(
1113 datasetType,
1114 collections,
1115 isResult=False,
1116 )
1117 return builder.finish()
1119 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
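    # Illustrative sketch: querying visit+detector data IDs constrained by the
    # existence of datasets in given collections (names are hypothetical).
    #
    #     dataIds = registry.queryDataIds(
    #         ["visit", "detector"],
    #         datasets="raw",
    #         collections=["HSC/raw/all"],
    #         instrument="HSC",
    #     )
    #     for dataId in dataIds:
    #         print(dataId)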
1121 def queryDimensionRecords(
1122 self,
1123 element: Union[DimensionElement, str],
1124 *,
1125 dataId: Optional[DataId] = None,
1126 datasets: Any = None,
1127 collections: Any = None,
1128 where: Optional[str] = None,
1129 components: Optional[bool] = None,
1130 bind: Optional[Mapping[str, Any]] = None,
1131 check: bool = True,
1132 **kwargs: Any,
1133 ) -> queries.DimensionRecordQueryResults:
1134 # Docstring inherited from lsst.daf.butler.registry.Registry
1135 if not isinstance(element, DimensionElement):
1136 try:
1137 element = self.dimensions[element]
1138 except KeyError as e:
1139 raise DimensionNameError(
1140 f"No such dimension '{element}', available dimensions: "
1141 + str(self.dimensions.getStaticElements())
1142 ) from e
1143 dataIds = self.queryDataIds(
1144 element.graph,
1145 dataId=dataId,
1146 datasets=datasets,
1147 collections=collections,
1148 where=where,
1149 components=components,
1150 bind=bind,
1151 check=check,
1152 **kwargs,
1153 )
1154 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
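    # Illustrative sketch: fetching dimension records for all exposures of a
    # hypothetical instrument.
    #
    #     for record in registry.queryDimensionRecords("exposure", instrument="HSC"):
    #         print(record)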
1156 def queryDatasetAssociations(
1157 self,
1158 datasetType: Union[str, DatasetType],
1159 collections: Any = ...,
1160 *,
1161 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1162 flattenChains: bool = False,
1163 ) -> Iterator[DatasetAssociation]:
1164 # Docstring inherited from lsst.daf.butler.registry.Registry
1165 if collections is None:
1166 if not self.defaults.collections:
1167 raise NoDefaultCollectionError(
1168 "No collections provided to findDataset, and no defaults from registry construction."
1169 )
1170 collections = self.defaults.collections
1171 else:
1172 collections = CollectionQuery.fromExpression(collections)
1173 TimespanReprClass = self._db.getTimespanRepresentation()
1174 if isinstance(datasetType, str):
1175 storage = self._managers.datasets[datasetType]
1176 else:
1177 storage = self._managers.datasets[datasetType.name]
1178 for collectionRecord in collections.iter(
1179 self._managers.collections,
1180 collectionTypes=frozenset(collectionTypes),
1181 flattenChains=flattenChains,
1182 ):
1183 query = storage.select(collectionRecord)
1184 with self._db.query(query) as sql_result:
1185 sql_mappings = sql_result.mappings().fetchall()
1186 for row in sql_mappings:
1187 dataId = DataCoordinate.fromRequiredValues(
1188 storage.datasetType.dimensions,
1189 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1190 )
1191 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1192 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1193 if collectionRecord.type is CollectionType.CALIBRATION:
1194 timespan = TimespanReprClass.extract(row)
1195 else:
1196 timespan = None
1197 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1199 storageClasses: StorageClassFactory
1200 """All storage classes known to the registry (`StorageClassFactory`).
1201 """