Coverage for python/lsst/daf/butler/registries/sql.py: 13%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("SqlRegistry",)

import contextlib
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union

import sqlalchemy
from lsst.resources import ResourcePathExpression
from lsst.utils.iteration import ensure_iterable

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
    ddl,
)
from ..core.utils import transactional
from ..registry import (
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(
        cls,
        config: Optional[Union[RegistryConfig, str]] = None,
        dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
        butlerRoot: Optional[ResourcePathExpression] = None,
    ) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, the default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, the default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)
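
    # Usage sketch (illustrative only): creating a brand-new, empty registry.
    # The configuration file names and the repository root below are
    # hypothetical.
    #
    #     registry = SqlRegistry.createFromConfig(
    #         "registry.yaml",
    #         dimensionConfig="dimensions.yaml",
    #         butlerRoot="/path/to/repo",
    #     )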

    @classmethod
    def fromConfig(
        cls,
        config: Union[ButlerConfig, RegistryConfig, Config, str],
        butlerRoot: Optional[ResourcePathExpression] = None,
        writeable: bool = True,
        defaults: Optional[RegistryDefaults] = None,
    ) -> Registry:
        """Create `Registry` subclass instance from `config`.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `lsst.resources.ResourcePathExpression`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default), create a read-write connection to the
            database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString),
            origin=config.get("origin", 0),
            namespace=config.get("namespace"),
            writeable=writeable,
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)
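
    # Usage sketch (illustrative only): opening an existing repository's
    # registry read-only with a default collection search path. The
    # configuration path and collection name are hypothetical.
    #
    #     defaults = RegistryDefaults(collections=["HSC/defaults"])
    #     registry = SqlRegistry.fromConfig("butler.yaml", writeable=False, defaults=defaults)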

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary, make sure
        that there are no currently active connections (no session or
        transaction in progress) and that the connection pool is reset with
        this method. This method should be called by the child process
        immediately after the fork.
        """
        self._db._engine.dispose()
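
    # Usage sketch (illustrative only): resetting the pool in each fork-based
    # worker. The names process_one and tasks are hypothetical.
    #
    #     import multiprocessing
    #
    #     # No session or transaction may be open at fork time; the pool is
    #     # reset in every child immediately after the fork.
    #     with multiprocessing.get_context("fork").Pool(
    #         processes=4, initializer=registry.resetConnectionPool
    #     ) as pool:
    #         results = pool.map(process_one, tasks)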

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)
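
    # Usage sketch (illustrative only): a datastore-style client keeping its
    # own records in an opaque table. The table name and column layout are
    # hypothetical.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("my_datastore_records", spec)
    #     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)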

    def registerCollection(
        self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
    ) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

    def getCollectionParentChains(self, collection: str) -> Set[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return {
            record.name
            for record in self._managers.collections.getParentChains(
                self._managers.collections.find(collection).key
            )
        }

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(
        self,
        datasetType: Union[DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        timespan: Optional[Timespan] = None,
        **kwargs: Any,
    ) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(
            dataId,
            graph=storage.datasetType.dimensions,
            universe=self.dimensions,
            defaults=self.defaults.dataId,
            **kwargs,
        )
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if collectionRecord.type is CollectionType.CALIBRATION and (
                not storage.datasetType.isCalibration() or timespan is None
            ):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

    @transactional
    def insertDatasets(
        self,
        datasetType: Union[DatasetType, str],
        dataIds: Iterable[DataId],
        run: Optional[str] = None,
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to insertDatasets, and no default from registry construction."
                )
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [
                self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDataIds = [
                DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
            ]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    @transactional
    def _importDatasets(
        self,
        datasets: Iterable[DatasetRef],
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to ingestDatasets, and no default from registry construction."
                )
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(
                f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                " RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError(
                    "One or more datasets is still present in one or more Datastores."
                ) from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(
        self,
        collection: str,
        datasetType: Union[str, DatasetType],
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataId]] = None,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [
                DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
            ]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(
        self,
        dataId: Optional[DataId] = None,
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
        withDefaults: bool = True,
        **kwargs: Any,
    ) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(
            dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
        )
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(f"No value or null value for required dimension {element.name}.")
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(
        self,
        element: Union[DimensionElement, str],
        *data: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        replace: bool = False,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [
                row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
            ]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(
        self,
        element: Union[DimensionElement, str],
        row: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        update: bool = False,
    ) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)

    def queryDatasetTypes(
        self,
        expression: Any = ...,
        *,
        components: Optional[bool] = None,
        missing: Optional[List[str]] = None,
    ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        unknownComponentsMessage = (
            "Could not find definition for storage class %s for dataset type %r;"
            " if it has components they will not be included in dataset type query results."
        )
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, datasetType.name)
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            done.add(name)
            if storage is None:
                if missing is not None:
                    missing.append(name)
            else:
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                if components is not False:
                    # Only check for the components if we are being asked
                    # for components or components is None.
                    try:
                        allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(
        self,
        expression: Any = ...,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
        includeChains: Optional[bool] = None,
    ) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetTypes argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        collectionTypes = ensure_iterable(collectionTypes)
        for record in query.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
            includeChains=includeChains,
        ):
            yield record.name

    def _makeQueryBuilder(
        self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
    ) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
            doomed_by=doomed_by,
        )
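
    # Usage sketch (illustrative only): building an advanced query directly,
    # mirroring what `queryDatasets` does internally. The dataset type name
    # "flat" and the use of default collections are hypothetical.
    #
    #     summary = queries.QuerySummary(
    #         requested=registry.dimensions.extract(["instrument", "detector"]),
    #         defaults=registry.defaults.dataId,
    #     )
    #     builder = registry._makeQueryBuilder(summary)
    #     builder.joinDataset(
    #         registry.getDatasetType("flat"), registry.defaults.collections, isResult=False
    #     )
    #     query = builder.finish()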

    def queryDatasets(
        self,
        datasetType: Any,
        *,
        collections: Any = None,
        dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
        dataId: Optional[DataId] = None,
        where: Optional[str] = None,
        findFirst: bool = False,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]],  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type matching {t!r} found."
                        for t in ensure_iterable(datasetType)
                    ],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # the composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(
                    parentResults, queries.ParentDatasetQueryResults
                ), "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(parentResults.withComponents(componentNames))
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
            datasets=[datasetType],
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)

    def queryDataIds(
        self,
        dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        missing: List[str] = []
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        def query_factory(
            order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
        ) -> Query:
            """Construct the Query object that generates query results."""
            summary = queries.QuerySummary(
                requested=requestedDimensions,
                dataId=standardizedDataId,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=standardizedDatasetTypes,
                order_by=order_by,
                limit=limit,
            )
            builder = self._makeQueryBuilder(
                summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
            )
            for datasetType in standardizedDatasetTypes:
                builder.joinDataset(
                    datasetType,
                    collections,
                    isResult=False,
                )
            return builder.finish()

        return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)

    def queryDimensionRecords(
        self,
        element: Union[DimensionElement, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DimensionRecordQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(
                    f"No such dimension '{element}', available dimensions: "
                    + str(self.dimensions.getStaticElements())
                ) from e
        dataIds = self.queryDataIds(
            element.graph,
            dataId=dataId,
            datasets=datasets,
            collections=collections,
            where=where,
            components=components,
            bind=bind,
            check=check,
            **kwargs,
        )
        return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
        ):
            query = storage.select(collectionRecord)
            for row in self._db.query(query.combine()).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names),
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """