Coverage for python/lsst/daf/butler/registries/sql.py: 14%
444 statements
coverage.py v7.2.5, created at 2023-05-15 00:10 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28import warnings
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Dict,
33 Iterable,
34 Iterator,
35 List,
36 Literal,
37 Mapping,
38 Optional,
39 Sequence,
40 Set,
41 Tuple,
42 Union,
43)
45import sqlalchemy
46from lsst.resources import ResourcePathExpression
47from lsst.utils.iteration import ensure_iterable
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 Dimension,
59 DimensionConfig,
60 DimensionElement,
61 DimensionGraph,
62 DimensionRecord,
63 DimensionUniverse,
64 NamedKeyMapping,
65 NameLookupMapping,
66 Progress,
67 StorageClassFactory,
68 Timespan,
69 ddl,
70)
71from ..core.utils import transactional
72from ..registry import (
73 ArgumentError,
74 CollectionExpressionError,
75 CollectionSummary,
76 CollectionType,
77 CollectionTypeError,
78 ConflictingDefinitionError,
79 DataIdValueError,
80 DatasetTypeError,
81 DimensionNameError,
82 InconsistentDataIdError,
83 NoDefaultCollectionError,
84 OrphanedRecordError,
85 Registry,
86 RegistryConfig,
87 RegistryDefaults,
88 queries,
89)
90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
94if TYPE_CHECKING:  # coverage: line 94 didn't jump to line 95 (the condition was never true)
95 from .._butlerConfig import ButlerConfig
96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
99_LOG = logging.getLogger(__name__)
102class SqlRegistry(Registry):
103 """Registry implementation based on SQLAlchemy.
105 Parameters
106 ----------
107 database : `Database`
108 Database instance to store the Registry data.
109 defaults : `RegistryDefaults`
110 Default collection search path and/or output `~CollectionType.RUN`
111 collection.
112 managers : `RegistryManagerInstances`
113 All the managers required for this registry.
114 """
116 defaultConfigFile: Optional[str] = None
117 """Path to configuration defaults. Accessed within the ``configs`` resource
118 or relative to a search path. Can be None if no defaults are specified.
119 """
121 @classmethod
122 def createFromConfig(
123 cls,
124 config: Optional[Union[RegistryConfig, str]] = None,
125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
126 butlerRoot: Optional[ResourcePathExpression] = None,
127 ) -> Registry:
128 """Create registry database and return `SqlRegistry` instance.
130 This method initializes database contents; the database must be
131 empty prior to calling this method.
133 Parameters
134 ----------
135 config : `RegistryConfig` or `str`, optional
136 Registry configuration; if missing, the default configuration will
137 be loaded from registry.yaml.
138 dimensionConfig : `DimensionConfig` or `str`, optional
139 Dimensions configuration; if missing, the default configuration
140 will be loaded from dimensions.yaml.
141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
142 Path to the repository root this `SqlRegistry` will manage.
144 Returns
145 -------
146 registry : `SqlRegistry`
147 A new `SqlRegistry` instance.
148 """
149 config = cls.forceRegistryConfig(config)
150 config.replaceRoot(butlerRoot)
152 if isinstance(dimensionConfig, str):
153 dimensionConfig = DimensionConfig(dimensionConfig)
154 elif dimensionConfig is None:
155 dimensionConfig = DimensionConfig()
156 elif not isinstance(dimensionConfig, DimensionConfig):
157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
159 DatabaseClass = config.getDatabaseClass()
160 database = DatabaseClass.fromUri(
161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
162 )
163 managerTypes = RegistryManagerTypes.fromConfig(config)
164 managers = managerTypes.makeRepo(database, dimensionConfig)
165 return cls(database, RegistryDefaults(), managers)
167 @classmethod
168 def fromConfig(
169 cls,
170 config: Union[ButlerConfig, RegistryConfig, Config, str],
171 butlerRoot: Optional[ResourcePathExpression] = None,
172 writeable: bool = True,
173 defaults: Optional[RegistryDefaults] = None,
174 ) -> Registry:
175 """Create `Registry` subclass instance from `config`.
177 Registry database must be initialized prior to calling this method.
179 Parameters
180 ----------
181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
182 Registry configuration.
183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
184 Path to the repository root this `Registry` will manage.
185 writeable : `bool`, optional
186 If `True` (default) create a read-write connection to the database.
187 defaults : `RegistryDefaults`, optional
188 Default collection search path and/or output `~CollectionType.RUN`
189 collection.
191 Returns
192 -------
193 registry : `SqlRegistry` (subclass)
194 A new `SqlRegistry` subclass instance.
195 """
196 config = cls.forceRegistryConfig(config)
197 config.replaceRoot(butlerRoot)
198 DatabaseClass = config.getDatabaseClass()
199 database = DatabaseClass.fromUri(
200 str(config.connectionString),
201 origin=config.get("origin", 0),
202 namespace=config.get("namespace"),
203 writeable=writeable,
204 )
205 managerTypes = RegistryManagerTypes.fromConfig(config)
206 with database.session():
207 managers = managerTypes.loadRepo(database)
208 if defaults is None:
209 defaults = RegistryDefaults()
210 return cls(database, defaults, managers)
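# Example (illustrative sketch): constructing a registry with the two class
# methods above.  The SQLite path, collection name, and config keys shown are
# assumptions following the usual registry.yaml layout, not values taken from
# this file.
from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults

config = RegistryConfig({"db": "sqlite:///example_repo/gen3.sqlite3"})
# createFromConfig builds the schema in an empty database ...
registry = SqlRegistry.createFromConfig(config)
# ... while fromConfig attaches to an existing one, here read-only and with a
# default collection search path.
registry = SqlRegistry.fromConfig(
    config, writeable=False, defaults=RegistryDefaults(collections=["HSC/defaults"])
)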
212 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
213 self._db = database
214 self._managers = managers
215 self.storageClasses = StorageClassFactory()
216 # Intentionally invoke property setter to initialize defaults. This
217 # can only be done after most of the rest of Registry has already been
218 # initialized, and must be done before the property getter is used.
219 self.defaults = defaults
220 # In the future DatasetIdFactory may become configurable and this
221 # instance will need to be shared with datasets manager.
222 self.datasetIdFactory = DatasetIdFactory()
224 def __str__(self) -> str:
225 return str(self._db)
227 def __repr__(self) -> str:
228 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
230 def isWriteable(self) -> bool:
231 # Docstring inherited from lsst.daf.butler.registry.Registry
232 return self._db.isWriteable()
234 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
235 # Docstring inherited from lsst.daf.butler.registry.Registry
236 if defaults is None:
237 # No need to copy, because `RegistryDefaults` is immutable; we
238 # effectively copy on write.
239 defaults = self.defaults
240 return type(self)(self._db, defaults, self._managers)
242 @property
243 def dimensions(self) -> DimensionUniverse:
244 # Docstring inherited from lsst.daf.butler.registry.Registry
245 return self._managers.dimensions.universe
247 def refresh(self) -> None:
248 # Docstring inherited from lsst.daf.butler.registry.Registry
249 with self._db.transaction():
250 self._managers.refresh()
252 @contextlib.contextmanager
253 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
254 # Docstring inherited from lsst.daf.butler.registry.Registry
255 try:
256 with self._db.transaction(savepoint=savepoint):
257 yield
258 except BaseException:
259 # TODO: this clears the caches sometimes when we wouldn't actually
260 # need to. Can we avoid that?
261 self._managers.dimensions.clearCaches()
262 raise
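# Example (illustrative sketch, assuming ``registry`` is a writeable
# SqlRegistry; the collection names are hypothetical): group several writes so
# they either all commit or all roll back.
with registry.transaction(savepoint=True):
    registry.registerRun("u/example/run")
    registry.registerCollection("u/example/tagged", CollectionType.TAGGED)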
264 def resetConnectionPool(self) -> None:
265 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
267 This operation is useful when using the registry with fork-based
268 multiprocessing. To use the registry across a fork boundary one has to
269 make sure that there are no currently active connections (no session or
270 transaction is in progress) and that the connection pool is reset using
271 this method. It should be called by the child process immediately
272 after the fork.
273 """
274 self._db._engine.dispose()
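# Example (illustrative sketch, assuming ``registry`` is an existing
# SqlRegistry with no open session or transaction): the fork-based
# multiprocessing pattern described in the docstring above.  The worker body
# is hypothetical; the essential point is calling resetConnectionPool() in the
# child before any database access.
import multiprocessing


def _child_task(registry: SqlRegistry) -> None:
    registry.resetConnectionPool()  # discard connections inherited from the parent
    registry.refresh()              # safe to talk to the database from here on


fork_ctx = multiprocessing.get_context("fork")
child = fork_ctx.Process(target=_child_task, args=(registry,))
child.start()
child.join()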
276 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
277 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
278 other data repository client.
280 Opaque table records can be added via `insertOpaqueData`, retrieved via
281 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
283 Parameters
284 ----------
285 tableName : `str`
286 Logical name of the opaque table. This may differ from the
287 actual name used in the database by a prefix and/or suffix.
288 spec : `ddl.TableSpec`
289 Specification for the table to be added.
290 """
291 self._managers.opaque.register(tableName, spec)
293 @transactional
294 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
295 """Insert records into an opaque table.
297 Parameters
298 ----------
299 tableName : `str`
300 Logical name of the opaque table. Must match the name used in a
301 previous call to `registerOpaqueTable`.
302 data
303 Each additional positional argument is a dictionary that represents
304 a single row to be added.
305 """
306 self._managers.opaque[tableName].insert(*data)
308 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
309 """Retrieve records from an opaque table.
311 Parameters
312 ----------
313 tableName : `str`
314 Logical name of the opaque table. Must match the name used in a
315 previous call to `registerOpaqueTable`.
316 where
317 Additional keyword arguments are interpreted as equality
318 constraints that restrict the returned rows (combined with AND);
319 keyword arguments are column names and values are the values they
320 must have.
322 Yields
323 ------
324 row : `dict`
325 A dictionary representing a single result row.
326 """
327 yield from self._managers.opaque[tableName].fetch(**where)
329 @transactional
330 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
331 """Remove records from an opaque table.
333 Parameters
334 ----------
335 tableName : `str`
336 Logical name of the opaque table. Must match the name used in a
337 previous call to `registerOpaqueTable`.
338 where
339 Additional keyword arguments are interpreted as equality
340 constraints that restrict the deleted rows (combined with AND);
341 keyword arguments are column names and values are the values they
342 must have.
343 """
344 self._managers.opaque[tableName].delete(where.keys(), where)
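# Example (illustrative sketch, assuming ``registry`` is a writeable
# SqlRegistry): a round trip through the opaque-table API above.  The table
# name and field layout are hypothetical; real datastores define their own
# ddl.TableSpec.
spec = ddl.TableSpec(
    fields=[
        ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
        ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    ]
)
registry.registerOpaqueTable("example_records", spec)
registry.insertOpaqueData("example_records", {"dataset_id": 1, "path": "data/a.fits"})
rows = list(registry.fetchOpaqueData("example_records", dataset_id=1))
registry.deleteOpaqueData("example_records", dataset_id=1)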
346 def registerCollection(
347 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
348 ) -> bool:
349 # Docstring inherited from lsst.daf.butler.registry.Registry
350 _, registered = self._managers.collections.register(name, type, doc=doc)
351 return registered
353 def getCollectionType(self, name: str) -> CollectionType:
354 # Docstring inherited from lsst.daf.butler.registry.Registry
355 return self._managers.collections.find(name).type
357 def _get_collection_record(self, name: str) -> CollectionRecord:
358 # Docstring inherited from lsst.daf.butler.registry.Registry
359 return self._managers.collections.find(name)
361 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
362 # Docstring inherited from lsst.daf.butler.registry.Registry
363 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
364 return registered
366 @transactional
367 def removeCollection(self, name: str) -> None:
368 # Docstring inherited from lsst.daf.butler.registry.Registry
369 self._managers.collections.remove(name)
371 def getCollectionChain(self, parent: str) -> tuple[str, ...]:
372 # Docstring inherited from lsst.daf.butler.registry.Registry
373 record = self._managers.collections.find(parent)
374 if record.type is not CollectionType.CHAINED:
375 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
376 assert isinstance(record, ChainedCollectionRecord)
377 return record.children
379 @transactional
380 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
381 # Docstring inherited from lsst.daf.butler.registry.Registry
382 record = self._managers.collections.find(parent)
383 if record.type is not CollectionType.CHAINED:
384 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
385 assert isinstance(record, ChainedCollectionRecord)
386 children = CollectionWildcard.from_expression(children).require_ordered()
387 if children != record.children or flatten:
388 record.update(self._managers.collections, children, flatten=flatten)
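# Example (illustrative sketch, assuming ``registry`` is writeable; collection
# names are hypothetical): build a CHAINED collection from the primitives
# above and read back its ordered children.
registry.registerRun("HSC/runs/example")
registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
registry.setCollectionChain("HSC/defaults", ["HSC/runs/example"])
assert registry.getCollectionChain("HSC/defaults") == ("HSC/runs/example",)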
390 def getCollectionParentChains(self, collection: str) -> Set[str]:
391 # Docstring inherited from lsst.daf.butler.registry.Registry
392 return {
393 record.name
394 for record in self._managers.collections.getParentChains(
395 self._managers.collections.find(collection).key
396 )
397 }
399 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
400 # Docstring inherited from lsst.daf.butler.registry.Registry
401 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
403 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
404 # Docstring inherited from lsst.daf.butler.registry.Registry
405 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
407 def getCollectionSummary(self, collection: str) -> CollectionSummary:
408 # Docstring inherited from lsst.daf.butler.registry.Registry
409 record = self._managers.collections.find(collection)
410 return self._managers.datasets.getCollectionSummary(record)
412 def registerDatasetType(self, datasetType: DatasetType) -> bool:
413 # Docstring inherited from lsst.daf.butler.registry.Registry
414 _, inserted = self._managers.datasets.register(datasetType)
415 return inserted
417 def removeDatasetType(self, name: str) -> None:
418 # Docstring inherited from lsst.daf.butler.registry.Registry
419 self._managers.datasets.remove(name)
421 def getDatasetType(self, name: str) -> DatasetType:
422 # Docstring inherited from lsst.daf.butler.registry.Registry
423 parent_name, component = DatasetType.splitDatasetTypeName(name)
424 storage = self._managers.datasets[parent_name]
425 if component is None:
426 return storage.datasetType
427 else:
428 return storage.datasetType.makeComponentDatasetType(component)
430 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
431 # Docstring inherited from lsst.daf.butler.registry.Registry
432 return self._managers.datasets.supportsIdGenerationMode(mode)
434 def findDataset(
435 self,
436 datasetType: Union[DatasetType, str],
437 dataId: Optional[DataId] = None,
438 *,
439 collections: Any = None,
440 timespan: Optional[Timespan] = None,
441 **kwargs: Any,
442 ) -> Optional[DatasetRef]:
443 # Docstring inherited from lsst.daf.butler.registry.Registry
444 if isinstance(datasetType, DatasetType):
445 parent_name, component = datasetType.nameAndComponent()
446 else:
447 parent_name, component = DatasetType.splitDatasetTypeName(datasetType)
448 storage = self._managers.datasets[parent_name]
449 dataId = DataCoordinate.standardize(
450 dataId,
451 graph=storage.datasetType.dimensions,
452 universe=self.dimensions,
453 defaults=self.defaults.dataId,
454 **kwargs,
455 )
456 if collections is None:
457 if not self.defaults.collections:
458 raise NoDefaultCollectionError(
459 "No collections provided to findDataset, and no defaults from registry construction."
460 )
461 collections = self.defaults.collections
462 collections = CollectionWildcard.from_expression(collections)
463 collections.require_ordered()
464 for collectionRecord in self._managers.collections.resolve_wildcard(collections):
465 if collectionRecord.type is CollectionType.CALIBRATION and (
466 not storage.datasetType.isCalibration() or timespan is None
467 ):
468 continue
469 result = storage.find(collectionRecord, dataId, timespan=timespan)
470 if result is not None:
471 if component is not None:
472 return result.makeComponentRef(component)
473 return result
475 return None
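# Example (illustrative sketch, assuming ``registry`` holds HSC-style data;
# the dataset type, data ID values, and collection are hypothetical): find a
# single calibration dataset, supplying a timespan because the collection may
# be a CALIBRATION collection.
ref = registry.findDataset(
    "bias",
    instrument="HSC",
    detector=0,
    collections=["HSC/calib"],
    timespan=Timespan(begin=None, end=None),
)
if ref is not None:
    print(ref.dataId, ref.run)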
477 @transactional
478 def insertDatasets(
479 self,
480 datasetType: Union[DatasetType, str],
481 dataIds: Iterable[DataId],
482 run: Optional[str] = None,
483 expand: bool = True,
484 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
485 ) -> List[DatasetRef]:
486 # Docstring inherited from lsst.daf.butler.registry.Registry
487 if isinstance(datasetType, DatasetType):
488 storage = self._managers.datasets.find(datasetType.name)
489 if storage is None:
490 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
491 else:
492 storage = self._managers.datasets.find(datasetType)
493 if storage is None:
494 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
495 if run is None:
496 if self.defaults.run is None:
497 raise NoDefaultCollectionError(
498 "No run provided to insertDatasets, and no default from registry construction."
499 )
500 run = self.defaults.run
501 runRecord = self._managers.collections.find(run)
502 if runRecord.type is not CollectionType.RUN:
503 raise CollectionTypeError(
504 f"Given collection is of type {runRecord.type.name}; RUN collection required."
505 )
506 assert isinstance(runRecord, RunRecord)
507 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
508 if expand:
509 expandedDataIds = [
510 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
511 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
512 ]
513 else:
514 expandedDataIds = [
515 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
516 ]
517 try:
518 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
519 if self._managers.obscore:
520 self._managers.obscore.add_datasets(refs)
521 except sqlalchemy.exc.IntegrityError as err:
522 raise ConflictingDefinitionError(
523 f"A database constraint failure was triggered by inserting "
524 f"one or more datasets of type {storage.datasetType} into "
525 f"collection '{run}'. "
526 f"This probably means a dataset with the same data ID "
527 f"and dataset type already exists, but it may also mean a "
528 f"dimension row is missing."
529 ) from err
530 return refs
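# Example (illustrative sketch, assuming ``registry`` is writeable and the
# referenced dimension records already exist; names, dimensions, and storage
# class are hypothetical): register a dataset type and insert one dataset
# into a RUN collection.
dataset_type = DatasetType(
    "example_catalog",
    dimensions=["instrument", "visit", "detector"],
    storageClass="SourceCatalog",
    universe=registry.dimensions,
)
registry.registerDatasetType(dataset_type)
registry.registerRun("u/example/run")
(ref,) = registry.insertDatasets(
    dataset_type,
    dataIds=[{"instrument": "HSC", "visit": 903334, "detector": 10}],
    run="u/example/run",
)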
532 @transactional
533 def _importDatasets(
534 self,
535 datasets: Iterable[DatasetRef],
536 expand: bool = True,
537 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
538 reuseIds: bool = False,
539 ) -> List[DatasetRef]:
540 # Docstring inherited from lsst.daf.butler.registry.Registry
541 datasets = list(datasets)
542 if not datasets:
543 # nothing to do
544 return []
546 # find dataset type
547 datasetTypes = set(dataset.datasetType for dataset in datasets)
548 if len(datasetTypes) != 1:
549 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
550 datasetType = datasetTypes.pop()
552 # get storage handler for this dataset type
553 storage = self._managers.datasets.find(datasetType.name)
554 if storage is None:
555 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
557 # find run name
558 runs = set(dataset.run for dataset in datasets)
559 if len(runs) != 1:
560 raise ValueError(f"Multiple run names in input datasets: {runs}")
561 run = runs.pop()
562 if run is None:
563 if self.defaults.run is None:
564 raise NoDefaultCollectionError(
565 "No run provided to ingestDatasets, and no default from registry construction."
566 )
567 run = self.defaults.run
569 runRecord = self._managers.collections.find(run)
570 if runRecord.type is not CollectionType.RUN:
571 raise CollectionTypeError(
572 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
573 " RUN collection required."
574 )
575 assert isinstance(runRecord, RunRecord)
577 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
578 if expand:
579 expandedDatasets = [
580 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
581 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
582 ]
583 else:
584 expandedDatasets = [
585 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
586 for dataset in datasets
587 ]
589 try:
590 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
591 if self._managers.obscore:
592 self._managers.obscore.add_datasets(refs)
593 except sqlalchemy.exc.IntegrityError as err:
594 raise ConflictingDefinitionError(
595 f"A database constraint failure was triggered by inserting "
596 f"one or more datasets of type {storage.datasetType} into "
597 f"collection '{run}'. "
598 f"This probably means a dataset with the same data ID "
599 f"and dataset type already exists, but it may also mean a "
600 f"dimension row is missing."
601 ) from err
602 return refs
604 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
605 # Docstring inherited from lsst.daf.butler.registry.Registry
606 return self._managers.datasets.getDatasetRef(id)
608 @transactional
609 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
610 # Docstring inherited from lsst.daf.butler.registry.Registry
611 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
612 for datasetType, refsForType in progress.iter_item_chunks(
613 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
614 ):
615 storage = self._managers.datasets[datasetType.name]
616 try:
617 storage.delete(refsForType)
618 except sqlalchemy.exc.IntegrityError as err:
619 raise OrphanedRecordError(
620 "One or more datasets is still present in one or more Datastores."
621 ) from err
623 @transactional
624 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
625 # Docstring inherited from lsst.daf.butler.registry.Registry
626 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
627 collectionRecord = self._managers.collections.find(collection)
628 if collectionRecord.type is not CollectionType.TAGGED:
629 raise CollectionTypeError(
630 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
631 )
632 for datasetType, refsForType in progress.iter_item_chunks(
633 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
634 ):
635 storage = self._managers.datasets[datasetType.name]
636 try:
637 storage.associate(collectionRecord, refsForType)
638 if self._managers.obscore:
639 # If a TAGGED collection is being monitored by ObsCore
640 # manager then we may need to save the dataset.
641 self._managers.obscore.associate(refsForType, collectionRecord)
642 except sqlalchemy.exc.IntegrityError as err:
643 raise ConflictingDefinitionError(
644 f"Constraint violation while associating dataset of type {datasetType.name} with "
645 f"collection {collection}. This probably means that one or more datasets with the same "
646 f"dataset type and data ID already exist in the collection, but it may also indicate "
647 f"that the datasets do not exist."
648 ) from err
650 @transactional
651 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
652 # Docstring inherited from lsst.daf.butler.registry.Registry
653 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
654 collectionRecord = self._managers.collections.find(collection)
655 if collectionRecord.type is not CollectionType.TAGGED:
656 raise CollectionTypeError(
657 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
658 )
659 for datasetType, refsForType in progress.iter_item_chunks(
660 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
661 ):
662 storage = self._managers.datasets[datasetType.name]
663 storage.disassociate(collectionRecord, refsForType)
664 if self._managers.obscore:
665 self._managers.obscore.disassociate(refsForType, collectionRecord)
667 @transactional
668 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
669 # Docstring inherited from lsst.daf.butler.registry.Registry
670 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
671 collectionRecord = self._managers.collections.find(collection)
672 for datasetType, refsForType in progress.iter_item_chunks(
673 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
674 ):
675 storage = self._managers.datasets[datasetType.name]
676 storage.certify(collectionRecord, refsForType, timespan)
678 @transactional
679 def decertify(
680 self,
681 collection: str,
682 datasetType: Union[str, DatasetType],
683 timespan: Timespan,
684 *,
685 dataIds: Optional[Iterable[DataId]] = None,
686 ) -> None:
687 # Docstring inherited from lsst.daf.butler.registry.Registry
688 collectionRecord = self._managers.collections.find(collection)
689 if isinstance(datasetType, str):
690 storage = self._managers.datasets[datasetType]
691 else:
692 storage = self._managers.datasets[datasetType.name]
693 standardizedDataIds = None
694 if dataIds is not None:
695 standardizedDataIds = [
696 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
697 ]
698 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
700 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
701 """Return an object that allows a new `Datastore` instance to
702 communicate with this `Registry`.
704 Returns
705 -------
706 manager : `DatastoreRegistryBridgeManager`
707 Object that mediates communication between this `Registry` and its
708 associated datastores.
709 """
710 return self._managers.datastores
712 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
713 # Docstring inherited from lsst.daf.butler.registry.Registry
714 return self._managers.datastores.findDatastores(ref)
716 def expandDataId(
717 self,
718 dataId: Optional[DataId] = None,
719 *,
720 graph: Optional[DimensionGraph] = None,
721 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
722 withDefaults: bool = True,
723 **kwargs: Any,
724 ) -> DataCoordinate:
725 # Docstring inherited from lsst.daf.butler.registry.Registry
726 if not withDefaults:
727 defaults = None
728 else:
729 defaults = self.defaults.dataId
730 try:
731 standardized = DataCoordinate.standardize(
732 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
733 )
734 except KeyError as exc:
735 # This means either kwargs have some odd name or required
736 # dimension is missing.
737 raise DimensionNameError(str(exc)) from exc
738 if standardized.hasRecords():
739 return standardized
740 if records is None:
741 records = {}
742 elif isinstance(records, NamedKeyMapping):
743 records = records.byName()
744 else:
745 records = dict(records)
746 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
747 records.update(dataId.records.byName())
748 keys = standardized.byName()
749 for element in standardized.graph.primaryKeyTraversalOrder:
750 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
751 if record is ...:
752 if isinstance(element, Dimension) and keys.get(element.name) is None:
753 if element in standardized.graph.required:
754 raise DimensionNameError(
755 f"No value or null value for required dimension {element.name}."
756 )
757 keys[element.name] = None
758 record = None
759 else:
760 storage = self._managers.dimensions[element]
761 dataIdSet = DataCoordinateIterable.fromScalar(
762 DataCoordinate.standardize(keys, graph=element.graph)
763 )
764 fetched = tuple(storage.fetch(dataIdSet))
765 try:
766 (record,) = fetched
767 except ValueError:
768 record = None
769 records[element.name] = record
770 if record is not None:
771 for d in element.implied:
772 value = getattr(record, d.name)
773 if keys.setdefault(d.name, value) != value:
774 raise InconsistentDataIdError(
775 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
776 f"but {element.name} implies {d.name}={value!r}."
777 )
778 else:
779 if element in standardized.graph.required:
780 raise DataIdValueError(
781 f"Could not fetch record for required dimension {element.name} via keys {keys}."
782 )
783 if element.alwaysJoin:
784 raise InconsistentDataIdError(
785 f"Could not fetch record for element {element.name} via keys {keys}, ",
786 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
787 "related.",
788 )
789 for d in element.implied:
790 keys.setdefault(d.name, None)
791 records.setdefault(d.name, None)
792 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
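# Example (illustrative sketch, assuming ``registry`` contains HSC exposure
# records; the values and record attributes are hypothetical): expand a
# minimal data ID so that implied dimensions and full DimensionRecords become
# available.
data_id = registry.expandDataId(instrument="HSC", exposure=903334)
print(data_id["physical_filter"])            # implied dimension filled in
print(data_id.records["exposure"].timespan)  # full record now attached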
794 def insertDimensionData(
795 self,
796 element: Union[DimensionElement, str],
797 *data: Union[Mapping[str, Any], DimensionRecord],
798 conform: bool = True,
799 replace: bool = False,
800 skip_existing: bool = False,
801 ) -> None:
802 # Docstring inherited from lsst.daf.butler.registry.Registry
803 if conform:
804 if isinstance(element, str):
805 element = self.dimensions[element]
806 records = [
807 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
808 ]
809 else:
810 # Ignore typing since caller said to trust them with conform=False.
811 records = data # type: ignore
812 storage = self._managers.dimensions[element] # type: ignore
813 storage.insert(*records, replace=replace, skip_existing=skip_existing)
815 def syncDimensionData(
816 self,
817 element: Union[DimensionElement, str],
818 row: Union[Mapping[str, Any], DimensionRecord],
819 conform: bool = True,
820 update: bool = False,
821 ) -> Union[bool, Dict[str, Any]]:
822 # Docstring inherited from lsst.daf.butler.registry.Registry
823 if conform:
824 if isinstance(element, str):
825 element = self.dimensions[element]
826 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
827 else:
828 # Ignore typing since caller said to trust them with conform=False.
829 record = row # type: ignore
830 storage = self._managers.dimensions[element] # type: ignore
831 return storage.sync(record, update=update)
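# Example (illustrative sketch, assuming ``registry`` is writeable; the field
# names follow the default dimension configuration for 'instrument' but should
# be checked against the active universe): insert a record, then sync the same
# record idempotently.
record = {"name": "DummyCam", "visit_max": 1_000_000, "exposure_max": 1_000_000, "detector_max": 100}
registry.insertDimensionData("instrument", record)
# syncDimensionData returns False when an identical record is already present.
changed = registry.syncDimensionData("instrument", record)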
833 def queryDatasetTypes(
834 self,
835 expression: Any = ...,
836 *,
837 components: Optional[bool] = None,
838 missing: Optional[List[str]] = None,
839 ) -> Iterable[DatasetType]:
840 # Docstring inherited from lsst.daf.butler.registry.Registry
841 wildcard = DatasetTypeWildcard.from_expression(expression)
842 composition_dict = self._managers.datasets.resolve_wildcard(
843 wildcard,
844 components=components,
845 missing=missing,
846 )
847 result: list[DatasetType] = []
848 for parent_dataset_type, components_for_parent in composition_dict.items():
849 result.extend(
850 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type
851 for c in components_for_parent
852 )
853 return result
855 def queryCollections(
856 self,
857 expression: Any = ...,
858 datasetType: Optional[DatasetType] = None,
859 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
860 flattenChains: bool = False,
861 includeChains: Optional[bool] = None,
862 ) -> Sequence[str]:
863 # Docstring inherited from lsst.daf.butler.registry.Registry
865 # Right now the datasetType argument is completely ignored, but that
866 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
867 # ticket will take care of that.
868 try:
869 wildcard = CollectionWildcard.from_expression(expression)
870 except TypeError as exc:
871 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
872 collectionTypes = ensure_iterable(collectionTypes)
873 return [
874 record.name
875 for record in self._managers.collections.resolve_wildcard(
876 wildcard,
877 collection_types=frozenset(collectionTypes),
878 flatten_chains=flattenChains,
879 include_chains=includeChains,
880 )
881 ]
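# Example (illustrative sketch, assuming ``registry`` is populated; patterns
# and collection names are hypothetical): wildcard queries over dataset types
# and collections.
import re

coadd_types = list(registry.queryDatasetTypes(re.compile(r"^deepCoadd.*")))
run_collections = registry.queryCollections(
    "HSC/runs/*",
    collectionTypes={CollectionType.RUN},
    flattenChains=True,
)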
883 def _makeQueryBuilder(
884 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
885 ) -> queries.QueryBuilder:
886 """Return a `QueryBuilder` instance capable of constructing and
887 managing more complex queries than those obtainable via `Registry`
888 interfaces.
890 This is an advanced interface; downstream code should prefer
891 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
892 are sufficient.
894 Parameters
895 ----------
896 summary : `queries.QuerySummary`
897 Object describing and categorizing the full set of dimensions that
898 will be included in the query.
899 doomed_by : `Iterable` of `str`, optional
900 A list of diagnostic messages that indicate why the query is going
901 to yield no results and should not even be executed. If an empty
902 container (default) the query will be executed unless other code
903 determines that it is doomed.
905 Returns
906 -------
907 builder : `queries.QueryBuilder`
908 Object that can be used to construct and perform advanced queries.
909 """
910 return queries.QueryBuilder(
911 summary,
912 backend=queries.SqlQueryBackend(self._db, self._managers),
913 doomed_by=doomed_by,
914 )
916 def _standardize_query_dataset_args(
917 self,
918 datasets: Any,
919 collections: Any,
920 components: bool | None,
921 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
922 *,
923 doomed_by: list[str],
924 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]:
925 """Preprocess dataset arguments passed to query* methods.
927 Parameters
928 ----------
929 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
930 Expression identifying dataset types. See `queryDatasetTypes` for
931 details.
932 collections : `str`, `re.Pattern`, or iterable of these
933 Expression identifying collections to be searched. See
934 `queryCollections` for details.
935 components : `bool`, optional
936 If `True`, apply all expression patterns to component dataset type
937 names as well. If `False`, never apply patterns to components.
938 If `None` (default), apply patterns to components only if their
939 parent datasets were not matched by the expression.
940 Fully-specified component datasets (`str` or `DatasetType`
941 instances) are always included.
943 Values other than `False` are deprecated, and only `False` will be
944 supported after v26. After v27 this argument will be removed
945 entirely.
946 mode : `str`, optional
947 The way in which datasets are being used in this query; one of:
949 - "find_first": this is a query for the first dataset in an
950 ordered list of collections. Prohibits collection wildcards,
951 but permits dataset type wildcards.
953 - "find_all": this is a query for all datasets in all matched
954 collections. Permits collection and dataset type wildcards.
956 - "constrain": this is a query for something other than datasets,
957 with results constrained by dataset existence. Permits
958 collection wildcards and prohibits ``...`` as a dataset type
959 wildcard.
960 doomed_by : `list` [ `str` ]
961 List to append messages indicating why the query is doomed to
962 yield no results.
964 Returns
965 -------
966 composition : `defaultdict` [ `DatasetType`, `list` [ `str` ] ]
967 Dictionary mapping parent dataset type to `list` of components
968 matched for that dataset type (or `None` for the parent itself).
969 collections : `CollectionWildcard`
970 Processed collection expression.
971 """
972 composition: dict[DatasetType, list[str | None]] = {}
973 if datasets is not None:
974 if not collections:
975 if not self.defaults.collections:
976 raise NoDefaultCollectionError("No collections, and no registry default collections.")
977 collections = self.defaults.collections
978 else:
979 collections = CollectionWildcard.from_expression(collections)
980 if mode == "find_first" and collections.patterns:
981 raise TypeError(
982 f"Collection pattern(s) {collections.patterns} not allowed in this context."
983 )
984 missing: list[str] = []
985 composition = self._managers.datasets.resolve_wildcard(
986 datasets, components=components, missing=missing, explicit_only=(mode == "constrain")
987 )
988 if missing and mode == "constrain":
989 # After v26 this should raise MissingDatasetTypeError, to be
990 # implemented on DM-36303.
991 warnings.warn(
992 f"Dataset type(s) {missing} are not registered; this will be an error after v26.",
993 FutureWarning,
994 )
995 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
996 elif collections:
997 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
998 return composition, collections
1000 def queryDatasets(
1001 self,
1002 datasetType: Any,
1003 *,
1004 collections: Any = None,
1005 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1006 dataId: Optional[DataId] = None,
1007 where: Optional[str] = None,
1008 findFirst: bool = False,
1009 components: Optional[bool] = None,
1010 bind: Optional[Mapping[str, Any]] = None,
1011 check: bool = True,
1012 **kwargs: Any,
1013 ) -> queries.DatasetQueryResults:
1014 # Docstring inherited from lsst.daf.butler.registry.Registry
1015 doomed_by: list[str] = []
1016 data_id = self.expandDataId(dataId, **kwargs)
1017 dataset_composition, collections = self._standardize_query_dataset_args(
1018 datasetType,
1019 collections,
1020 components,
1021 mode="find_first" if findFirst else "find_all",
1022 doomed_by=doomed_by,
1023 )
1024 parent_results: list[queries.ParentDatasetQueryResults] = []
1025 for parent_dataset_type, components_for_parent in dataset_composition.items():
1026 # The full set of dimensions in the query is the combination of
1027 # those needed for the DatasetType and those explicitly requested,
1028 # if any.
1029 dimension_names = set(parent_dataset_type.dimensions.names)
1030 if dimensions is not None:
1031 dimension_names.update(self.dimensions.extract(dimensions).names)
1032 # Construct the summary structure needed to construct a
1033 # QueryBuilder.
1034 summary = queries.QuerySummary(
1035 requested=DimensionGraph(self.dimensions, names=dimension_names),
1036 dataId=data_id,
1037 expression=where,
1038 bind=bind,
1039 defaults=self.defaults.dataId,
1040 check=check,
1041 datasets=[parent_dataset_type],
1042 )
1043 builder = self._makeQueryBuilder(summary)
1044 # Add the dataset subquery to the query, telling the QueryBuilder
1045 # to include the rank of the selected collection in the results
1046 # only if we need to findFirst. Note that if any of the
1047 # collections are actually wildcard expressions, and
1048 # findFirst=True, this will raise TypeError for us.
1049 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
1050 query = builder.finish()
1051 parent_results.append(
1052 queries.ParentDatasetQueryResults(
1053 self._db, query, datasetType=parent_dataset_type, components=components_for_parent
1054 )
1055 )
1056 if not parent_results:
1057 doomed_by.extend(
1058 f"No registered dataset type matching {t!r} found, so no matching datasets can "
1059 "exist in any collection."
1060 for t in ensure_iterable(datasetType)
1061 )
1062 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
1063 elif len(parent_results) == 1:
1064 return parent_results[0]
1065 else:
1066 return queries.ChainedDatasetQueryResults(parent_results)
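# Example (illustrative sketch, assuming ``registry`` is populated; dataset
# type, collection, and dimension values are hypothetical): a find-first
# dataset query constrained by a ``where`` expression with a bind parameter.
refs = registry.queryDatasets(
    "calexp",
    collections=["HSC/runs/RC2"],
    where="visit = my_visit AND detector IN (10, 11)",
    bind={"my_visit": 903334},
    findFirst=True,
)
for ref in refs:
    print(ref.dataId, ref.run)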
1068 def queryDataIds(
1069 self,
1070 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1071 *,
1072 dataId: Optional[DataId] = None,
1073 datasets: Any = None,
1074 collections: Any = None,
1075 where: Optional[str] = None,
1076 components: Optional[bool] = None,
1077 bind: Optional[Mapping[str, Any]] = None,
1078 check: bool = True,
1079 **kwargs: Any,
1080 ) -> queries.DataCoordinateQueryResults:
1081 # Docstring inherited from lsst.daf.butler.registry.Registry
1082 dimensions = ensure_iterable(dimensions)
1083 requestedDimensions = self.dimensions.extract(dimensions)
1084 doomed_by: list[str] = []
1085 data_id = self.expandDataId(dataId, **kwargs)
1086 dataset_composition, collections = self._standardize_query_dataset_args(
1087 datasets, collections, components, doomed_by=doomed_by
1088 )
1090 def query_factory(
1091 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1092 ) -> queries.Query:
1093 """Construct the Query object that generates query results."""
1094 summary = queries.QuerySummary(
1095 requested=requestedDimensions,
1096 dataId=data_id,
1097 expression=where,
1098 bind=bind,
1099 defaults=self.defaults.dataId,
1100 check=check,
1101 datasets=dataset_composition.keys(),
1102 order_by=order_by,
1103 limit=limit,
1104 )
1105 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1106 for datasetType in dataset_composition:
1107 builder.joinDataset(datasetType, collections, isResult=False)
1108 return builder.finish()
1110 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
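# Example (illustrative sketch, assuming ``registry`` is populated; the
# dataset type and collection are hypothetical): query data IDs constrained by
# dataset existence, then expand them to attach dimension records.
data_ids = registry.queryDataIds(
    ["visit", "detector"],
    datasets="raw",
    collections="HSC/raw/all",
    where="instrument = 'HSC'",
)
for data_id in data_ids.expanded():
    print(data_id)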
1112 def queryDimensionRecords(
1113 self,
1114 element: Union[DimensionElement, str],
1115 *,
1116 dataId: Optional[DataId] = None,
1117 datasets: Any = None,
1118 collections: Any = None,
1119 where: Optional[str] = None,
1120 components: Optional[bool] = None,
1121 bind: Optional[Mapping[str, Any]] = None,
1122 check: bool = True,
1123 **kwargs: Any,
1124 ) -> queries.DimensionRecordQueryResults:
1125 # Docstring inherited from lsst.daf.butler.registry.Registry
1126 if not isinstance(element, DimensionElement):
1127 try:
1128 element = self.dimensions[element]
1129 except KeyError as e:
1130 raise DimensionNameError(
1131 f"No such dimension '{element}', available dimensions: "
1132 + str(self.dimensions.getStaticElements())
1133 ) from e
1134 dataIds = self.queryDataIds(
1135 element.graph,
1136 dataId=dataId,
1137 datasets=datasets,
1138 collections=collections,
1139 where=where,
1140 components=components,
1141 bind=bind,
1142 check=check,
1143 **kwargs,
1144 )
1145 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1147 def queryDatasetAssociations(
1148 self,
1149 datasetType: Union[str, DatasetType],
1150 collections: Any = ...,
1151 *,
1152 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1153 flattenChains: bool = False,
1154 ) -> Iterator[DatasetAssociation]:
1155 # Docstring inherited from lsst.daf.butler.registry.Registry
1156 if collections is None:
1157 if not self.defaults.collections:
1158 raise NoDefaultCollectionError(
1159 "No collections provided to findDataset, and no defaults from registry construction."
1160 )
1161 collections = self.defaults.collections
1162 collections = CollectionWildcard.from_expression(collections)
1163 TimespanReprClass = self._db.getTimespanRepresentation()
1164 if isinstance(datasetType, str):
1165 storage = self._managers.datasets[datasetType]
1166 else:
1167 storage = self._managers.datasets[datasetType.name]
1168 for collectionRecord in self._managers.collections.resolve_wildcard(
1169 collections,
1170 collection_types=frozenset(collectionTypes),
1171 flatten_chains=flattenChains,
1172 ):
1173 query = storage.select(collectionRecord)
1174 with self._db.query(query) as sql_result:
1175 sql_mappings = sql_result.mappings().fetchall()
1176 for row in sql_mappings:
1177 dataId = DataCoordinate.fromRequiredValues(
1178 storage.datasetType.dimensions,
1179 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1180 )
1181 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1182 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1183 if collectionRecord.type is CollectionType.CALIBRATION:
1184 timespan = TimespanReprClass.extract(row)
1185 else:
1186 timespan = None
1187 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
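# Example (illustrative sketch, assuming ``registry`` is populated; names are
# hypothetical): list every collection membership of a calibration dataset
# type, including validity ranges from CALIBRATION collections.
for assoc in registry.queryDatasetAssociations(
    "bias",
    collections="HSC/calib*",
    collectionTypes={CollectionType.CALIBRATION, CollectionType.TAGGED},
):
    print(assoc.collection, assoc.ref.dataId, assoc.timespan)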
1189 storageClasses: StorageClassFactory
1190 """All storage classes known to the registry (`StorageClassFactory`).
1191 """