Coverage for python/lsst/daf/butler/registries/sql.py: 14%
440 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28import warnings
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Dict,
33 Iterable,
34 Iterator,
35 List,
36 Literal,
37 Mapping,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 Union,
43)
45import sqlalchemy
46from lsst.resources import ResourcePathExpression
47from lsst.utils.iteration import ensure_iterable
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 Dimension,
59 DimensionConfig,
60 DimensionElement,
61 DimensionGraph,
62 DimensionRecord,
63 DimensionUniverse,
64 NamedKeyMapping,
65 NameLookupMapping,
66 Progress,
67 StorageClassFactory,
68 Timespan,
69 ddl,
70)
71from ..core.utils import transactional
72from ..registry import (
73 ArgumentError,
74 CollectionExpressionError,
75 CollectionSummary,
76 CollectionType,
77 CollectionTypeError,
78 ConflictingDefinitionError,
79 DataIdValueError,
80 DatasetTypeError,
81 DimensionNameError,
82 InconsistentDataIdError,
83 NoDefaultCollectionError,
84 OrphanedRecordError,
85 Registry,
86 RegistryConfig,
87 RegistryDefaults,
88 queries,
89)
90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
94if TYPE_CHECKING:
95 from .._butlerConfig import ButlerConfig
96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
99_LOG = logging.getLogger(__name__)
102class SqlRegistry(Registry):
103 """Registry implementation based on SQLAlchemy.
105 Parameters
106 ----------
107 database : `Database`
108 Database instance to store Registry.
109 defaults : `RegistryDefaults`
110 Default collection search path and/or output `~CollectionType.RUN`
111 collection.
112 managers : `RegistryManagerInstances`
113 All the managers required for this registry.
114 """
116 defaultConfigFile: Optional[str] = None
117 """Path to configuration defaults. Accessed within the ``configs`` resource
118 or relative to a search path. Can be None if no defaults specified.
119 """
121 @classmethod
122 def createFromConfig(
123 cls,
124 config: Optional[Union[RegistryConfig, str]] = None,
125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
126 butlerRoot: Optional[ResourcePathExpression] = None,
127 ) -> Registry:
128 """Create registry database and return `SqlRegistry` instance.
130 This method initializes database contents; the database must be empty
131 prior to calling this method.
133 Parameters
134 ----------
135 config : `RegistryConfig` or `str`, optional
136 Registry configuration; if missing, the default configuration is
137 loaded from registry.yaml.
138 dimensionConfig : `DimensionConfig` or `str`, optional
139 Dimensions configuration; if missing, the default configuration is
140 loaded from dimensions.yaml.
141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
142 Path to the repository root this `SqlRegistry` will manage.
144 Returns
145 -------
146 registry : `SqlRegistry`
147 A new `SqlRegistry` instance.
148 """
149 config = cls.forceRegistryConfig(config)
150 config.replaceRoot(butlerRoot)
152 if isinstance(dimensionConfig, str):
153 dimensionConfig = DimensionConfig(dimensionConfig)
154 elif dimensionConfig is None:
155 dimensionConfig = DimensionConfig()
156 elif not isinstance(dimensionConfig, DimensionConfig):
157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
159 DatabaseClass = config.getDatabaseClass()
160 database = DatabaseClass.fromUri(
161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
162 )
163 managerTypes = RegistryManagerTypes.fromConfig(config)
164 managers = managerTypes.makeRepo(database, dimensionConfig)
165 return cls(database, RegistryDefaults(), managers)
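# Usage sketch (not part of the original source): bootstrapping a registry in
# an empty database. The configuration file name and repository root below are
# hypothetical placeholders.
def _example_create_from_config() -> Registry:
    config = RegistryConfig("registry.yaml")  # assumed to point at an empty database
    return SqlRegistry.createFromConfig(config, butlerRoot="/path/to/new/repo")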
167 @classmethod
168 def fromConfig(
169 cls,
170 config: Union[ButlerConfig, RegistryConfig, Config, str],
171 butlerRoot: Optional[ResourcePathExpression] = None,
172 writeable: bool = True,
173 defaults: Optional[RegistryDefaults] = None,
174 ) -> Registry:
175 """Create `Registry` subclass instance from `config`.
177 Registry database must be initialized prior to calling this method.
179 Parameters
180 ----------
181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
182 Registry configuration.
183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
184 Path to the repository root this `Registry` will manage.
185 writeable : `bool`, optional
186 If `True` (default) create a read-write connection to the database.
187 defaults : `RegistryDefaults`, optional
188 Default collection search path and/or output `~CollectionType.RUN`
189 collection.
191 Returns
192 -------
193 registry : `SqlRegistry` (subclass)
194 A new `SqlRegistry` subclass instance.
195 """
196 config = cls.forceRegistryConfig(config)
197 config.replaceRoot(butlerRoot)
198 DatabaseClass = config.getDatabaseClass()
199 database = DatabaseClass.fromUri(
200 str(config.connectionString),
201 origin=config.get("origin", 0),
202 namespace=config.get("namespace"),
203 writeable=writeable,
204 )
205 managerTypes = RegistryManagerTypes.fromConfig(config)
206 managers = managerTypes.loadRepo(database)
207 if defaults is None:
208 defaults = RegistryDefaults()
209 return cls(database, defaults, managers)
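# Usage sketch (not part of the original source): connecting to an existing
# repository read-only with a default collection search path. The config path
# and collection name are hypothetical, and the RegistryDefaults arguments
# shown are assumed from its (collections, run) parameters.
def _example_from_config() -> Registry:
    defaults = RegistryDefaults(collections=["HSC/defaults"], run=None)
    return SqlRegistry.fromConfig("registry.yaml", writeable=False, defaults=defaults)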
211 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
212 self._db = database
213 self._managers = managers
214 self.storageClasses = StorageClassFactory()
215 # Intentionally invoke property setter to initialize defaults. This
216 # can only be done after most of the rest of Registry has already been
217 # initialized, and must be done before the property getter is used.
218 self.defaults = defaults
219 # In the future DatasetIdFactory may become configurable and this
220 # instance will need to be shared with datasets manager.
221 self.datasetIdFactory = DatasetIdFactory()
223 def __str__(self) -> str:
224 return str(self._db)
226 def __repr__(self) -> str:
227 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
229 def isWriteable(self) -> bool:
230 # Docstring inherited from lsst.daf.butler.registry.Registry
231 return self._db.isWriteable()
233 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
234 # Docstring inherited from lsst.daf.butler.registry.Registry
235 if defaults is None:
236 # No need to copy, because `RegistryDefaults` is immutable; we
237 # effectively copy on write.
238 defaults = self.defaults
239 return type(self)(self._db, defaults, self._managers)
241 @property
242 def dimensions(self) -> DimensionUniverse:
243 # Docstring inherited from lsst.daf.butler.registry.Registry
244 return self._managers.dimensions.universe
246 def refresh(self) -> None:
247 # Docstring inherited from lsst.daf.butler.registry.Registry
248 self._managers.refresh()
250 @contextlib.contextmanager
251 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
252 # Docstring inherited from lsst.daf.butler.registry.Registry
253 try:
254 with self._db.transaction(savepoint=savepoint):
255 yield
256 except BaseException:
257 # TODO: this clears the caches sometimes when we wouldn't actually
258 # need to. Can we avoid that?
259 self._managers.dimensions.clearCaches()
260 raise
262 def resetConnectionPool(self) -> None:
263 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
265 This operation is useful when using the registry with fork-based
266 multiprocessing. To use the registry across a fork boundary, make sure
267 that there are no currently active connections (no session or
268 transaction in progress) and that the connection pool is reset using
269 this method. The child process should call this method immediately
270 after the fork.
271 """
272 self._db._engine.dispose()
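# Usage sketch (not part of the original source): a worker routine for
# fork-based multiprocessing. The helper name is hypothetical.
def _example_fork_worker(registry: SqlRegistry) -> None:
    # Call in the child process immediately after the fork, before issuing any
    # queries, so that pooled connections are not shared with the parent.
    registry.resetConnectionPool()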
274 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
275 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
276 other data repository client.
278 Opaque table records can be added via `insertOpaqueData`, retrieved via
279 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
281 Parameters
282 ----------
283 tableName : `str`
284 Logical name of the opaque table. This may differ from the
285 actual name used in the database by a prefix and/or suffix.
286 spec : `ddl.TableSpec`
287 Specification for the table to be added.
288 """
289 self._managers.opaque.register(tableName, spec)
291 @transactional
292 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
293 """Insert records into an opaque table.
295 Parameters
296 ----------
297 tableName : `str`
298 Logical name of the opaque table. Must match the name used in a
299 previous call to `registerOpaqueTable`.
300 data
301 Each additional positional argument is a dictionary that represents
302 a single row to be added.
303 """
304 self._managers.opaque[tableName].insert(*data)
306 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
307 """Retrieve records from an opaque table.
309 Parameters
310 ----------
311 tableName : `str`
312 Logical name of the opaque table. Must match the name used in a
313 previous call to `registerOpaqueTable`.
314 where
315 Additional keyword arguments are interpreted as equality
316 constraints that restrict the returned rows (combined with AND);
317 keyword arguments are column names and values are the values they
318 must have.
320 Yields
321 ------
322 row : `dict`
323 A dictionary representing a single result row.
324 """
325 yield from self._managers.opaque[tableName].fetch(**where)
327 @transactional
328 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
329 """Remove records from an opaque table.
331 Parameters
332 ----------
333 tableName : `str`
334 Logical name of the opaque table. Must match the name used in a
335 previous call to `registerOpaqueTable`.
336 where
337 Additional keyword arguments are interpreted as equality
338 constraints that restrict the deleted rows (combined with AND);
339 keyword arguments are column names and values are the values they
340 must have.
341 """
342 self._managers.opaque[tableName].delete(where.keys(), where)
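# Usage sketch (not part of the original source): round-tripping rows through
# an opaque table. The table name, column layout, and values are hypothetical,
# and the ddl.TableSpec/ddl.FieldSpec arguments shown are assumptions.
def _example_opaque_table(registry: SqlRegistry) -> None:
    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
            ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
        ]
    )
    registry.registerOpaqueTable("example_records", spec)
    registry.insertOpaqueData("example_records", {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("example_records", dataset_id=1))
    assert rows and rows[0]["path"] == "a/b.fits"
    registry.deleteOpaqueData("example_records", dataset_id=1)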
344 def registerCollection(
345 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
346 ) -> bool:
347 # Docstring inherited from lsst.daf.butler.registry.Registry
348 _, registered = self._managers.collections.register(name, type, doc=doc)
349 return registered
351 def getCollectionType(self, name: str) -> CollectionType:
352 # Docstring inherited from lsst.daf.butler.registry.Registry
353 return self._managers.collections.find(name).type
355 def _get_collection_record(self, name: str) -> CollectionRecord:
356 # Docstring inherited from lsst.daf.butler.registry.Registry
357 return self._managers.collections.find(name)
359 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
360 # Docstring inherited from lsst.daf.butler.registry.Registry
361 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
362 return registered
364 @transactional
365 def removeCollection(self, name: str) -> None:
366 # Docstring inherited from lsst.daf.butler.registry.Registry
367 self._managers.collections.remove(name)
369 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
370 # Docstring inherited from lsst.daf.butler.registry.Registry
371 record = self._managers.collections.find(parent)
372 if record.type is not CollectionType.CHAINED:
373 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
374 assert isinstance(record, ChainedCollectionRecord)
375 return record.children
377 @transactional
378 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
379 # Docstring inherited from lsst.daf.butler.registry.Registry
380 record = self._managers.collections.find(parent)
381 if record.type is not CollectionType.CHAINED:
382 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
383 assert isinstance(record, ChainedCollectionRecord)
384 children = CollectionWildcard.from_expression(children).require_ordered()
385 if children != record.children or flatten:
386 record.update(self._managers.collections, children, flatten=flatten)
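# Usage sketch (not part of the original source): building a CHAINED collection
# that searches two RUN collections in order. Collection names are hypothetical.
def _example_collection_chain(registry: SqlRegistry) -> tuple[str, ...]:
    registry.registerRun("example/run1")
    registry.registerRun("example/run2")
    registry.registerCollection("example/chain", CollectionType.CHAINED)
    registry.setCollectionChain("example/chain", ["example/run1", "example/run2"])
    return registry.getCollectionChain("example/chain")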
388 def getCollectionParentChains(self, collection: str) -> Set[str]:
389 # Docstring inherited from lsst.daf.butler.registry.Registry
390 return {
391 record.name
392 for record in self._managers.collections.getParentChains(
393 self._managers.collections.find(collection).key
394 )
395 }
397 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
398 # Docstring inherited from lsst.daf.butler.registry.Registry
399 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
401 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
402 # Docstring inherited from lsst.daf.butler.registry.Registry
403 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
405 def getCollectionSummary(self, collection: str) -> CollectionSummary:
406 # Docstring inherited from lsst.daf.butler.registry.Registry
407 record = self._managers.collections.find(collection)
408 return self._managers.datasets.getCollectionSummary(record)
410 def registerDatasetType(self, datasetType: DatasetType) -> bool:
411 # Docstring inherited from lsst.daf.butler.registry.Registry
412 _, inserted = self._managers.datasets.register(datasetType)
413 return inserted
415 def removeDatasetType(self, name: str) -> None:
416 # Docstring inherited from lsst.daf.butler.registry.Registry
417 self._managers.datasets.remove(name)
419 def getDatasetType(self, name: str) -> DatasetType:
420 # Docstring inherited from lsst.daf.butler.registry.Registry
421 parent_name, component = DatasetType.splitDatasetTypeName(name)
422 storage = self._managers.datasets[parent_name]
423 if component is None:
424 return storage.datasetType
425 else:
426 return storage.datasetType.makeComponentDatasetType(component)
428 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
429 # Docstring inherited from lsst.daf.butler.registry.Registry
430 return self._managers.datasets.supportsIdGenerationMode(mode)
432 def findDataset(
433 self,
434 datasetType: Union[DatasetType, str],
435 dataId: Optional[DataId] = None,
436 *,
437 collections: Any = None,
438 timespan: Optional[Timespan] = None,
439 **kwargs: Any,
440 ) -> Optional[DatasetRef]:
441 # Docstring inherited from lsst.daf.butler.registry.Registry
442 if isinstance(datasetType, DatasetType):
443 parent_name, component = datasetType.nameAndComponent()
444 else:
445 parent_name, component = DatasetType.splitDatasetTypeName(datasetType)
446 storage = self._managers.datasets[parent_name]
447 dataId = DataCoordinate.standardize(
448 dataId,
449 graph=storage.datasetType.dimensions,
450 universe=self.dimensions,
451 defaults=self.defaults.dataId,
452 **kwargs,
453 )
454 if collections is None:
455 if not self.defaults.collections:
456 raise NoDefaultCollectionError(
457 "No collections provided to findDataset, and no defaults from registry construction."
458 )
459 collections = self.defaults.collections
460 collections = CollectionWildcard.from_expression(collections)
461 collections.require_ordered()
462 for collectionRecord in self._managers.collections.resolve_wildcard(collections):
463 if collectionRecord.type is CollectionType.CALIBRATION and (
464 not storage.datasetType.isCalibration() or timespan is None
465 ):
466 continue
467 result = storage.find(collectionRecord, dataId, timespan=timespan)
468 if result is not None:
469 if component is not None:
470 return result.makeComponentRef(component)
471 return result
473 return None
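# Usage sketch (not part of the original source): looking up a single dataset by
# dataset type and data ID. The dataset type, dimension values, and collection
# name are hypothetical.
def _example_find_dataset(registry: SqlRegistry) -> Optional[DatasetRef]:
    return registry.findDataset(
        "calexp",
        instrument="HSC",
        detector=50,
        visit=903334,
        collections=["HSC/runs/example"],
    )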
475 @transactional
476 def insertDatasets(
477 self,
478 datasetType: Union[DatasetType, str],
479 dataIds: Iterable[DataId],
480 run: Optional[str] = None,
481 expand: bool = True,
482 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
483 ) -> List[DatasetRef]:
484 # Docstring inherited from lsst.daf.butler.registry.Registry
485 if isinstance(datasetType, DatasetType):
486 storage = self._managers.datasets.find(datasetType.name)
487 if storage is None:
488 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
489 else:
490 storage = self._managers.datasets.find(datasetType)
491 if storage is None:
492 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
493 if run is None:
494 if self.defaults.run is None:
495 raise NoDefaultCollectionError(
496 "No run provided to insertDatasets, and no default from registry construction."
497 )
498 run = self.defaults.run
499 runRecord = self._managers.collections.find(run)
500 if runRecord.type is not CollectionType.RUN:
501 raise CollectionTypeError(
502 f"Given collection is of type {runRecord.type.name}; RUN collection required."
503 )
504 assert isinstance(runRecord, RunRecord)
505 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
506 if expand:
507 expandedDataIds = [
508 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
509 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
510 ]
511 else:
512 expandedDataIds = [
513 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
514 ]
515 try:
516 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
517 if self._managers.obscore:
518 self._managers.obscore.add_datasets(refs)
519 except sqlalchemy.exc.IntegrityError as err:
520 raise ConflictingDefinitionError(
521 f"A database constraint failure was triggered by inserting "
522 f"one or more datasets of type {storage.datasetType} into "
523 f"collection '{run}'. "
524 f"This probably means a dataset with the same data ID "
525 f"and dataset type already exists, but it may also mean a "
526 f"dimension row is missing."
527 ) from err
528 return refs
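# Usage sketch (not part of the original source): registering new datasets in a
# RUN collection. Assumes the "raw" dataset type has already been registered;
# the data ID values and run name are hypothetical.
def _example_insert_datasets(registry: SqlRegistry) -> List[DatasetRef]:
    registry.registerRun("example/run1")
    return registry.insertDatasets(
        "raw",
        dataIds=[{"instrument": "HSC", "detector": 50, "exposure": 903334}],
        run="example/run1",
    )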
530 @transactional
531 def _importDatasets(
532 self,
533 datasets: Iterable[DatasetRef],
534 expand: bool = True,
535 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
536 reuseIds: bool = False,
537 ) -> List[DatasetRef]:
538 # Docstring inherited from lsst.daf.butler.registry.Registry
539 datasets = list(datasets)
540 if not datasets:
541 # nothing to do
542 return []
544 # find dataset type
545 datasetTypes = set(dataset.datasetType for dataset in datasets)
546 if len(datasetTypes) != 1:
547 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
548 datasetType = datasetTypes.pop()
550 # get storage handler for this dataset type
551 storage = self._managers.datasets.find(datasetType.name)
552 if storage is None:
553 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
555 # find run name
556 runs = set(dataset.run for dataset in datasets)
557 if len(runs) != 1:
558 raise ValueError(f"Multiple run names in input datasets: {runs}")
559 run = runs.pop()
560 if run is None:
561 if self.defaults.run is None:
562 raise NoDefaultCollectionError(
563 "No run provided to ingestDatasets, and no default from registry construction."
564 )
565 run = self.defaults.run
567 runRecord = self._managers.collections.find(run)
568 if runRecord.type is not CollectionType.RUN:
569 raise CollectionTypeError(
570 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
571 " RUN collection required."
572 )
573 assert isinstance(runRecord, RunRecord)
575 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
576 if expand:
577 expandedDatasets = [
578 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
579 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
580 ]
581 else:
582 expandedDatasets = [
583 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
584 for dataset in datasets
585 ]
587 try:
588 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
589 if self._managers.obscore:
590 self._managers.obscore.add_datasets(refs)
591 except sqlalchemy.exc.IntegrityError as err:
592 raise ConflictingDefinitionError(
593 f"A database constraint failure was triggered by inserting "
594 f"one or more datasets of type {storage.datasetType} into "
595 f"collection '{run}'. "
596 f"This probably means a dataset with the same data ID "
597 f"and dataset type already exists, but it may also mean a "
598 f"dimension row is missing."
599 ) from err
600 return refs
602 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
603 # Docstring inherited from lsst.daf.butler.registry.Registry
604 return self._managers.datasets.getDatasetRef(id)
606 @transactional
607 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
608 # Docstring inherited from lsst.daf.butler.registry.Registry
609 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
610 for datasetType, refsForType in progress.iter_item_chunks(
611 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
612 ):
613 storage = self._managers.datasets[datasetType.name]
614 try:
615 storage.delete(refsForType)
616 except sqlalchemy.exc.IntegrityError as err:
617 raise OrphanedRecordError(
618 "One or more datasets is still present in one or more Datastores."
619 ) from err
621 @transactional
622 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
623 # Docstring inherited from lsst.daf.butler.registry.Registry
624 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
625 collectionRecord = self._managers.collections.find(collection)
626 if collectionRecord.type is not CollectionType.TAGGED:
627 raise CollectionTypeError(
628 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
629 )
630 for datasetType, refsForType in progress.iter_item_chunks(
631 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
632 ):
633 storage = self._managers.datasets[datasetType.name]
634 try:
635 storage.associate(collectionRecord, refsForType)
636 if self._managers.obscore:
637 # If a TAGGED collection is being monitored by ObsCore
638 # manager then we may need to save the dataset.
639 self._managers.obscore.associate(refsForType, collectionRecord)
640 except sqlalchemy.exc.IntegrityError as err:
641 raise ConflictingDefinitionError(
642 f"Constraint violation while associating dataset of type {datasetType.name} with "
643 f"collection {collection}. This probably means that one or more datasets with the same "
644 f"dataset type and data ID already exist in the collection, but it may also indicate "
645 f"that the datasets do not exist."
646 ) from err
648 @transactional
649 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
650 # Docstring inherited from lsst.daf.butler.registry.Registry
651 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
652 collectionRecord = self._managers.collections.find(collection)
653 if collectionRecord.type is not CollectionType.TAGGED:
654 raise CollectionTypeError(
655 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
656 )
657 for datasetType, refsForType in progress.iter_item_chunks(
658 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
659 ):
660 storage = self._managers.datasets[datasetType.name]
661 storage.disassociate(collectionRecord, refsForType)
662 if self._managers.obscore:
663 self._managers.obscore.disassociate(refsForType, collectionRecord)
665 @transactional
666 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
667 # Docstring inherited from lsst.daf.butler.registry.Registry
668 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
669 collectionRecord = self._managers.collections.find(collection)
670 for datasetType, refsForType in progress.iter_item_chunks(
671 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
672 ):
673 storage = self._managers.datasets[datasetType.name]
674 storage.certify(collectionRecord, refsForType, timespan)
676 @transactional
677 def decertify(
678 self,
679 collection: str,
680 datasetType: Union[str, DatasetType],
681 timespan: Timespan,
682 *,
683 dataIds: Optional[Iterable[DataId]] = None,
684 ) -> None:
685 # Docstring inherited from lsst.daf.butler.registry.Registry
686 collectionRecord = self._managers.collections.find(collection)
687 if isinstance(datasetType, str):
688 storage = self._managers.datasets[datasetType]
689 else:
690 storage = self._managers.datasets[datasetType.name]
691 standardizedDataIds = None
692 if dataIds is not None:
693 standardizedDataIds = [
694 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
695 ]
696 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
698 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
699 """Return an object that allows a new `Datastore` instance to
700 communicate with this `Registry`.
702 Returns
703 -------
704 manager : `DatastoreRegistryBridgeManager`
705 Object that mediates communication between this `Registry` and its
706 associated datastores.
707 """
708 return self._managers.datastores
710 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
711 # Docstring inherited from lsst.daf.butler.registry.Registry
712 return self._managers.datastores.findDatastores(ref)
714 def expandDataId(
715 self,
716 dataId: Optional[DataId] = None,
717 *,
718 graph: Optional[DimensionGraph] = None,
719 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
720 withDefaults: bool = True,
721 **kwargs: Any,
722 ) -> DataCoordinate:
723 # Docstring inherited from lsst.daf.butler.registry.Registry
724 if not withDefaults:
725 defaults = None
726 else:
727 defaults = self.defaults.dataId
728 try:
729 standardized = DataCoordinate.standardize(
730 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
731 )
732 except KeyError as exc:
733 # This means either kwargs have some odd name or required
734 # dimension is missing.
735 raise DimensionNameError(str(exc)) from exc
736 if standardized.hasRecords():
737 return standardized
738 if records is None:
739 records = {}
740 elif isinstance(records, NamedKeyMapping):
741 records = records.byName()
742 else:
743 records = dict(records)
744 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
745 records.update(dataId.records.byName())
746 keys = standardized.byName()
747 for element in standardized.graph.primaryKeyTraversalOrder:
748 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
749 if record is ...:
750 if isinstance(element, Dimension) and keys.get(element.name) is None:
751 if element in standardized.graph.required:
752 raise DimensionNameError(
753 f"No value or null value for required dimension {element.name}."
754 )
755 keys[element.name] = None
756 record = None
757 else:
758 storage = self._managers.dimensions[element]
759 dataIdSet = DataCoordinateIterable.fromScalar(
760 DataCoordinate.standardize(keys, graph=element.graph)
761 )
762 fetched = tuple(storage.fetch(dataIdSet))
763 try:
764 (record,) = fetched
765 except ValueError:
766 record = None
767 records[element.name] = record
768 if record is not None:
769 for d in element.implied:
770 value = getattr(record, d.name)
771 if keys.setdefault(d.name, value) != value:
772 raise InconsistentDataIdError(
773 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
774 f"but {element.name} implies {d.name}={value!r}."
775 )
776 else:
777 if element in standardized.graph.required:
778 raise DataIdValueError(
779 f"Could not fetch record for required dimension {element.name} via keys {keys}."
780 )
781 if element.alwaysJoin:
782 raise InconsistentDataIdError(
783 f"Could not fetch record for element {element.name} via keys {keys}, ",
784 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
785 "related.",
786 )
787 for d in element.implied:
788 keys.setdefault(d.name, None)
789 records.setdefault(d.name, None)
790 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
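# Usage sketch (not part of the original source): attaching dimension records to
# a minimal data ID so dimension metadata becomes available. The dimension
# values are hypothetical.
def _example_expand_data_id(registry: SqlRegistry) -> DataCoordinate:
    data_id = registry.expandDataId(instrument="HSC", exposure=903334)
    # The expanded coordinate now carries DimensionRecords for its elements.
    assert data_id.hasRecords()
    return data_id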
792 def insertDimensionData(
793 self,
794 element: Union[DimensionElement, str],
795 *data: Union[Mapping[str, Any], DimensionRecord],
796 conform: bool = True,
797 replace: bool = False,
798 skip_existing: bool = False,
799 ) -> None:
800 # Docstring inherited from lsst.daf.butler.registry.Registry
801 if conform:
802 if isinstance(element, str):
803 element = self.dimensions[element]
804 records = [
805 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
806 ]
807 else:
808 # Ignore typing since caller said to trust them with conform=False.
809 records = data # type: ignore
810 storage = self._managers.dimensions[element] # type: ignore
811 storage.insert(*records, replace=replace, skip_existing=skip_existing)
813 def syncDimensionData(
814 self,
815 element: Union[DimensionElement, str],
816 row: Union[Mapping[str, Any], DimensionRecord],
817 conform: bool = True,
818 update: bool = False,
819 ) -> Union[bool, Dict[str, Any]]:
820 # Docstring inherited from lsst.daf.butler.registry.Registry
821 if conform:
822 if isinstance(element, str):
823 element = self.dimensions[element]
824 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
825 else:
826 # Ignore typing since caller said to trust them with conform=False.
827 record = row # type: ignore
828 storage = self._managers.dimensions[element] # type: ignore
829 return storage.sync(record, update=update)
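# Usage sketch (not part of the original source): adding dimension rows. A real
# instrument record would normally carry more metadata; the values shown are
# hypothetical.
def _example_dimension_data(registry: SqlRegistry) -> None:
    registry.insertDimensionData("instrument", {"name": "ExampleCam"})
    # syncDimensionData inserts the row only if it is not already present and
    # reports whether anything changed.
    registry.syncDimensionData("instrument", {"name": "ExampleCam"})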
831 def queryDatasetTypes(
832 self,
833 expression: Any = ...,
834 *,
835 components: Optional[bool] = None,
836 missing: Optional[List[str]] = None,
837 ) -> Iterable[DatasetType]:
838 # Docstring inherited from lsst.daf.butler.registry.Registry
839 wildcard = DatasetTypeWildcard.from_expression(expression)
840 composition_dict = self._managers.datasets.resolve_wildcard(
841 wildcard,
842 components=components,
843 missing=missing,
844 )
845 result: list[DatasetType] = []
846 for parent_dataset_type, components_for_parent in composition_dict.items():
847 result.extend(
848 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
849 for c in components_for_parent
850 )
851 return result
853 def queryCollections(
854 self,
855 expression: Any = ...,
856 datasetType: Optional[DatasetType] = None,
857 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
858 flattenChains: bool = False,
859 includeChains: Optional[bool] = None,
860 ) -> Sequence[str]:
861 # Docstring inherited from lsst.daf.butler.registry.Registry
863 # Right now the datasetTypes argument is completely ignored, but that
864 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
865 # ticket will take care of that.
866 try:
867 wildcard = CollectionWildcard.from_expression(expression)
868 except TypeError as exc:
869 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
870 collectionTypes = ensure_iterable(collectionTypes)
871 return [
872 record.name
873 for record in self._managers.collections.resolve_wildcard(
874 wildcard,
875 collection_types=frozenset(collectionTypes),
876 flatten_chains=flattenChains,
877 include_chains=includeChains,
878 )
879 ]
881 def _makeQueryBuilder(
882 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
883 ) -> queries.QueryBuilder:
884 """Return a `QueryBuilder` instance capable of constructing and
885 managing more complex queries than those obtainable via `Registry`
886 interfaces.
888 This is an advanced interface; downstream code should prefer
889 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
890 are sufficient.
892 Parameters
893 ----------
894 summary : `queries.QuerySummary`
895 Object describing and categorizing the full set of dimensions that
896 will be included in the query.
897 doomed_by : `Iterable` of `str`, optional
898 A list of diagnostic messages that indicate why the query is going
899 to yield no results and should not even be executed. If an empty
900 container (default) the query will be executed unless other code
901 determines that it is doomed.
903 Returns
904 -------
905 builder : `queries.QueryBuilder`
906 Object that can be used to construct and perform advanced queries.
907 """
908 return queries.QueryBuilder(
909 summary,
910 backend=queries.SqlQueryBackend(self._db, self._managers),
911 doomed_by=doomed_by,
912 )
914 def _standardize_query_dataset_args(
915 self,
916 datasets: Any,
917 collections: Any,
918 components: bool | None,
919 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
920 *,
921 doomed_by: list[str],
922 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
923 """Preprocess dataset arguments passed to query* methods.
925 Parameters
926 ----------
927 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
928 Expression identifying dataset types. See `queryDatasetTypes` for
929 details.
930 collections : `str`, `re.Pattern`, or iterable of these
931 Expression identifying collections to be searched. See
932 `queryCollections` for details.
933 components : `bool`, optional
934 If `True`, apply all expression patterns to component dataset type
935 names as well. If `False`, never apply patterns to components.
936 If `None` (default), apply patterns to components only if their
937 parent datasets were not matched by the expression.
938 Fully-specified component datasets (`str` or `DatasetType`
939 instances) are always included.
941 Values other than `False` are deprecated, and only `False` will be
942 supported after v26. After v27 this argument will be removed
943 entirely.
944 mode : `str`, optional
945 The way in which datasets are being used in this query; one of:
947 - "find_first": this is a query for the first dataset in an
948 ordered list of collections. Prohibits collection wildcards,
949 but permits dataset type wildcards.
951 - "find_all": this is a query for all datasets in all matched
952 collections. Permits collection and dataset type wildcards.
954 - "constrain": this is a query for something other than datasets,
955 with results constrained by dataset existence. Permits
956 collection wildcards and prohibits ``...`` as a dataset type
957 wildcard.
958 doomed_by : `list` [ `str` ]
959 List to append messages indicating why the query is doomed to
960 yield no results.
962 Returns
963 -------
964 composition : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
965 Dictionary mapping parent dataset type to a `list` of the components
966 matched for that dataset type (with `None` standing for the parent itself).
967 collections : `CollectionWildcard`
968 Processed collection expression.
969 """
970 composition: dict[DatasetType, list[str | None]] = {}
971 if datasets is not None:
972 if not collections:
973 if not self.defaults.collections:
974 raise NoDefaultCollectionError("No collections, and no registry default collections.")
975 collections = self.defaults.collections
976 else:
977 collections = CollectionWildcard.from_expression(collections)
978 if mode == "find_first" and collections.patterns:
979 raise TypeError(
980 f"Collection pattern(s) {collections.patterns} not allowed in this context."
981 )
982 missing: list[str] = []
983 composition = self._managers.datasets.resolve_wildcard(
984 datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
985 )
986 if missing and mode == "constrain":
987 # After v26 this should raise MissingDatasetTypeError, to be
988 # implemented on DM-36303.
989 warnings.warn(
990 f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
991 FutureWarning,
992 )
993 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
994 elif collections:
995 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
996 return composition, collections
998 def queryDatasets(
999 self,
1000 datasetType: Any,
1001 *,
1002 collections: Any = None,
1003 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1004 dataId: Optional[DataId] = None,
1005 where: Optional[str] = None,
1006 findFirst: bool = False,
1007 components: Optional[bool] = None,
1008 bind: Optional[Mapping[str, Any]] = None,
1009 check: bool = True,
1010 **kwargs: Any,
1011 ) -> queries.DatasetQueryResults:
1012 # Docstring inherited from lsst.daf.butler.registry.Registry
1013 doomed_by: list[str] = []
1014 data_id = self.expandDataId(dataId, **kwargs)
1015 dataset_composition, collections = self._standardize_query_dataset_args(
1016 datasetType,
1017 collections,
1018 components,
1019 mode="find_first" if findFirst else "find_all",
1020 doomed_by=doomed_by,
1021 )
1022 parent_results: list[queries.ParentDatasetQueryResults] = []
1023 for parent_dataset_type, components_for_parent in dataset_composition.items():
1024 # The full set of dimensions in the query is the combination of
1025 # those needed for the DatasetType and those explicitly requested,
1026 # if any.
1027 dimension_names = set(parent_dataset_type.dimensions.names)
1028 if dimensions is not None:
1029 dimension_names.update(self.dimensions.extract(dimensions).names)
1030 # Construct the summary structure needed to construct a
1031 # QueryBuilder.
1032 summary = queries.QuerySummary(
1033 requested=DimensionGraph(self.dimensions, names=dimension_names),
1034 dataId=data_id,
1035 expression=where,
1036 bind=bind,
1037 defaults=self.defaults.dataId,
1038 check=check,
1039 datasets=[parent_dataset_type],
1040 )
1041 builder = self._makeQueryBuilder(summary)
1042 # Add the dataset subquery to the query, telling the QueryBuilder
1043 # to include the rank of the selected collection in the results
1044 # only if we need to findFirst. Note that if any of the
1045 # collections are actually wildcard expressions, and
1046 # findFirst=True, this will raise TypeError for us.
1047 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
1048 query = builder.finish()
1049 parent_results.append(
1050 queries.ParentDatasetQueryResults(
1051 self._db, query, datasetType=parent_dataset_type, components=components_for_parent
1052 )
1053 )
1054 if not parent_results:
1055 doomed_by.extend(
1056 f"No registered dataset type matching {t!r} found, so no matching datasets can "
1057 "exist in any collection."
1058 for t in ensure_iterable(datasetType)
1059 )
1060 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
1061 elif len(parent_results) == 1:
1062 return parent_results[0]
1063 else:
1064 return queries.ChainedDatasetQueryResults(parent_results)
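# Usage sketch (not part of the original source): a find-first dataset query
# with a user expression and bind parameters. The dataset type, collection, and
# values are hypothetical.
def _example_query_datasets(registry: SqlRegistry) -> list[DatasetRef]:
    results = registry.queryDatasets(
        "calexp",
        collections=["HSC/runs/example"],
        where="instrument = 'HSC' AND visit = my_visit",
        bind={"my_visit": 903334},
        findFirst=True,
    )
    return list(results)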
1066 def queryDataIds(
1067 self,
1068 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1069 *,
1070 dataId: Optional[DataId] = None,
1071 datasets: Any = None,
1072 collections: Any = None,
1073 where: Optional[str] = None,
1074 components: Optional[bool] = None,
1075 bind: Optional[Mapping[str, Any]] = None,
1076 check: bool = True,
1077 **kwargs: Any,
1078 ) -> queries.DataCoordinateQueryResults:
1079 # Docstring inherited from lsst.daf.butler.registry.Registry
1080 dimensions = ensure_iterable(dimensions)
1081 requestedDimensions = self.dimensions.extract(dimensions)
1082 doomed_by: list[str] = []
1083 data_id = self.expandDataId(dataId, **kwargs)
1084 dataset_composition, collections = self._standardize_query_dataset_args(
1085 datasets, collections, components, doomed_by=doomed_by
1086 )
1088 def query_factory(
1089 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1090 ) -> queries.Query:
1091 """Construct the Query object that generates query results."""
1092 summary = queries.QuerySummary(
1093 requested=requestedDimensions,
1094 dataId=data_id,
1095 expression=where,
1096 bind=bind,
1097 defaults=self.defaults.dataId,
1098 check=check,
1099 datasets=dataset_composition.keys(),
1100 order_by=order_by,
1101 limit=limit,
1102 )
1103 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1104 for datasetType in dataset_composition:
1105 builder.joinDataset(datasetType, collections, isResult=False)
1106 return builder.finish()
1108 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
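# Usage sketch (not part of the original source): querying data IDs constrained
# by dataset existence. The dimensions, dataset type, and collection are
# hypothetical.
def _example_query_data_ids(registry: SqlRegistry) -> list[DataCoordinate]:
    results = registry.queryDataIds(
        ["visit", "detector"],
        datasets="raw",
        collections=["HSC/raw/all"],
        where="instrument = 'HSC'",
    )
    return list(results)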
1110 def queryDimensionRecords(
1111 self,
1112 element: Union[DimensionElement, str],
1113 *,
1114 dataId: Optional[DataId] = None,
1115 datasets: Any = None,
1116 collections: Any = None,
1117 where: Optional[str] = None,
1118 components: Optional[bool] = None,
1119 bind: Optional[Mapping[str, Any]] = None,
1120 check: bool = True,
1121 **kwargs: Any,
1122 ) -> queries.DimensionRecordQueryResults:
1123 # Docstring inherited from lsst.daf.butler.registry.Registry
1124 if not isinstance(element, DimensionElement):
1125 try:
1126 element = self.dimensions[element]
1127 except KeyError as e:
1128 raise DimensionNameError(
1129 f"No such dimension '{element}', available dimensions: "
1130 + str(self.dimensions.getStaticElements())
1131 ) from e
1132 dataIds = self.queryDataIds(
1133 element.graph,
1134 dataId=dataId,
1135 datasets=datasets,
1136 collections=collections,
1137 where=where,
1138 components=components,
1139 bind=bind,
1140 check=check,
1141 **kwargs,
1142 )
1143 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1145 def queryDatasetAssociations(
1146 self,
1147 datasetType: Union[str, DatasetType],
1148 collections: Any = ...,
1149 *,
1150 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1151 flattenChains: bool = False,
1152 ) -> Iterator[DatasetAssociation]:
1153 # Docstring inherited from lsst.daf.butler.registry.Registry
1154 if collections is None:
1155 if not self.defaults.collections:
1156 raise NoDefaultCollectionError(
1157 "No collections provided to findDataset, and no defaults from registry construction."
1158 )
1159 collections = self.defaults.collections
1160 collections = CollectionWildcard.from_expression(collections)
1161 TimespanReprClass = self._db.getTimespanRepresentation()
1162 if isinstance(datasetType, str):
1163 storage = self._managers.datasets[datasetType]
1164 else:
1165 storage = self._managers.datasets[datasetType.name]
1166 for collectionRecord in self._managers.collections.resolve_wildcard(
1167 collections,
1168 collection_types=frozenset(collectionTypes),
1169 flatten_chains=flattenChains,
1170 ):
1171 query = storage.select(collectionRecord)
1172 for row in self._db.query(query).mappings():
1173 dataId = DataCoordinate.fromRequiredValues(
1174 storage.datasetType.dimensions,
1175 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1176 )
1177 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1178 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1179 if collectionRecord.type is CollectionType.CALIBRATION:
1180 timespan = TimespanReprClass.extract(row)
1181 else:
1182 timespan = None
1183 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
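# Usage sketch (not part of the original source): listing the collections in
# which a dataset type appears, with validity ranges for CALIBRATION
# collections. The dataset type name is hypothetical.
def _example_query_associations(registry: SqlRegistry) -> None:
    for assoc in registry.queryDatasetAssociations("bias"):
        print(assoc.collection, assoc.ref.dataId, assoc.timespan)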
1185 storageClasses: StorageClassFactory
1186 """All storage classes known to the registry (`StorageClassFactory`).
1187 """