Coverage for python/lsst/daf/butler/registries/sql.py: 13%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
31import sqlalchemy
32from lsst.resources import ResourcePathExpression
33from lsst.utils.iteration import ensure_iterable
35from ..core import (
36 Config,
37 DataCoordinate,
38 DataCoordinateIterable,
39 DataId,
40 DatasetAssociation,
41 DatasetId,
42 DatasetRef,
43 DatasetType,
44 Dimension,
45 DimensionConfig,
46 DimensionElement,
47 DimensionGraph,
48 DimensionRecord,
49 DimensionUniverse,
50 NamedKeyMapping,
51 NameLookupMapping,
52 Progress,
53 StorageClassFactory,
54 Timespan,
55 ddl,
56)
57from ..core.utils import transactional
58from ..registry import (
59 CollectionSearch,
60 CollectionType,
61 ConflictingDefinitionError,
62 InconsistentDataIdError,
63 OrphanedRecordError,
64 Registry,
65 RegistryConfig,
66 RegistryDefaults,
67 queries,
68)
69from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
70from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
71from ..registry.queries import Query
72from ..registry.summaries import CollectionSummary
73from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
75if TYPE_CHECKING:
76 from .._butlerConfig import ButlerConfig
77 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
80_LOG = logging.getLogger(__name__)
83class SqlRegistry(Registry):
84 """Registry implementation based on SQLAlchemy.
86 Parameters
87 ----------
88 database : `Database`
89 Database instance in which the Registry data is stored.
90 defaults : `RegistryDefaults`
91 Default collection search path and/or output `~CollectionType.RUN`
92 collection.
93 managers : `RegistryManagerInstances`
94 All the managers required for this registry.
95 """
97 defaultConfigFile: Optional[str] = None
98 """Path to configuration defaults. Accessed within the ``configs`` resource
99 or relative to a search path. Can be `None` if no defaults are specified.
100 """
102 @classmethod
103 def createFromConfig(
104 cls,
105 config: Optional[Union[RegistryConfig, str]] = None,
106 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
107 butlerRoot: Optional[ResourcePathExpression] = None,
108 ) -> Registry:
109 """Create registry database and return `SqlRegistry` instance.
111 This method initializes the database contents; the database must be
112 empty prior to calling this method.
114 Parameters
115 ----------
116 config : `RegistryConfig` or `str`, optional
117 Registry configuration; if missing, the default configuration is
118 loaded from ``registry.yaml``.
119 dimensionConfig : `DimensionConfig` or `str`, optional
120 Dimension configuration; if missing, the default configuration is
121 loaded from ``dimensions.yaml``.
122 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
123 Path to the repository root this `SqlRegistry` will manage.
125 Returns
126 -------
127 registry : `SqlRegistry`
128 A new `SqlRegistry` instance.
129 """
130 config = cls.forceRegistryConfig(config)
131 config.replaceRoot(butlerRoot)
133 if isinstance(dimensionConfig, str):
134 dimensionConfig = DimensionConfig(dimensionConfig)
135 elif dimensionConfig is None:
136 dimensionConfig = DimensionConfig()
137 elif not isinstance(dimensionConfig, DimensionConfig):
138 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
140 DatabaseClass = config.getDatabaseClass()
141 database = DatabaseClass.fromUri(
142 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
143 )
144 managerTypes = RegistryManagerTypes.fromConfig(config)
145 managers = managerTypes.makeRepo(database, dimensionConfig)
146 return cls(database, RegistryDefaults(), managers)
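# A minimal usage sketch, assuming "registry.yaml" and "dimensions.yaml" are
# valid configuration files for a new, empty repository (hypothetical paths):
#
#     registry = SqlRegistry.createFromConfig(
#         "registry.yaml", dimensionConfig="dimensions.yaml", butlerRoot="/repo"
#     )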
148 @classmethod
149 def fromConfig(
150 cls,
151 config: Union[ButlerConfig, RegistryConfig, Config, str],
152 butlerRoot: Optional[ResourcePathExpression] = None,
153 writeable: bool = True,
154 defaults: Optional[RegistryDefaults] = None,
155 ) -> Registry:
156 """Create `Registry` subclass instance from `config`.
158 The registry database must be initialized prior to calling this method.
160 Parameters
161 ----------
162 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
163 Registry configuration.
164 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
165 Path to the repository root this `Registry` will manage.
166 writeable : `bool`, optional
167 If `True` (default) create a read-write connection to the database.
168 defaults : `RegistryDefaults`, optional
169 Default collection search path and/or output `~CollectionType.RUN`
170 collection.
172 Returns
173 -------
174 registry : `SqlRegistry` (subclass)
175 A new `SqlRegistry` subclass instance.
176 """
177 config = cls.forceRegistryConfig(config)
178 config.replaceRoot(butlerRoot)
179 DatabaseClass = config.getDatabaseClass()
180 database = DatabaseClass.fromUri(
181 str(config.connectionString),
182 origin=config.get("origin", 0),
183 namespace=config.get("namespace"),
184 writeable=writeable,
185 )
186 managerTypes = RegistryManagerTypes.fromConfig(config)
187 managers = managerTypes.loadRepo(database)
188 if defaults is None:
189 defaults = RegistryDefaults()
190 return cls(database, defaults, managers)
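# A minimal usage sketch, assuming an existing repository whose registry
# configuration lives at "/repo/butler.yaml" (hypothetical path; the
# RegistryDefaults keyword argument shown is an assumption):
#
#     defaults = RegistryDefaults(collections=["HSC/defaults"])
#     registry = SqlRegistry.fromConfig(
#         "/repo/butler.yaml", butlerRoot="/repo", writeable=False, defaults=defaults
#     )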
192 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
193 self._db = database
194 self._managers = managers
195 self.storageClasses = StorageClassFactory()
196 # Intentionally invoke property setter to initialize defaults. This
197 # can only be done after most of the rest of Registry has already been
198 # initialized, and must be done before the property getter is used.
199 self.defaults = defaults
201 def __str__(self) -> str:
202 return str(self._db)
204 def __repr__(self) -> str:
205 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
207 def isWriteable(self) -> bool:
208 # Docstring inherited from lsst.daf.butler.registry.Registry
209 return self._db.isWriteable()
211 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
212 # Docstring inherited from lsst.daf.butler.registry.Registry
213 if defaults is None:
214 # No need to copy, because `RegistryDefaults` is immutable; we
215 # effectively copy on write.
216 defaults = self.defaults
217 return type(self)(self._db, defaults, self._managers)
219 @property
220 def dimensions(self) -> DimensionUniverse:
221 # Docstring inherited from lsst.daf.butler.registry.Registry
222 return self._managers.dimensions.universe
224 def refresh(self) -> None:
225 # Docstring inherited from lsst.daf.butler.registry.Registry
226 self._managers.refresh()
228 @contextlib.contextmanager
229 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
230 # Docstring inherited from lsst.daf.butler.registry.Registry
231 try:
232 with self._db.transaction(savepoint=savepoint):
233 yield
234 except BaseException:
235 # TODO: this clears the caches sometimes when we wouldn't actually
236 # need to. Can we avoid that?
237 self._managers.dimensions.clearCaches()
238 raise
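# A minimal sketch of transactional use: either both registrations below are
# committed or neither is (the run name and dataset type are hypothetical):
#
#     with registry.transaction():
#         registry.registerRun("u/someone/run")
#         registry.registerDatasetType(someDatasetType)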
240 def resetConnectionPool(self) -> None:
241 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
243 This operation is useful when using the registry with fork-based
244 multiprocessing. To use the registry across a fork boundary, make sure
245 that no connections are currently active (no session or transaction is
246 in progress) and that the connection pool is reset using this method.
247 This method should be called by the child process immediately
248 after the fork.
249 """
250 self._db._engine.dispose()
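# A minimal sketch of the fork-based multiprocessing pattern described above
# (os.fork is shown for illustration; the same applies to multiprocessing's
# "fork" start method):
#
#     pid = os.fork()
#     if pid == 0:
#         registry.resetConnectionPool()  # child resets the pool before any queries
#         ...                             # child-process work using ``registry``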
252 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
253 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
254 other data repository client.
256 Opaque table records can be added via `insertOpaqueData`, retrieved via
257 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
259 Parameters
260 ----------
261 tableName : `str`
262 Logical name of the opaque table. This may differ from the
263 actual name used in the database by a prefix and/or suffix.
264 spec : `ddl.TableSpec`
265 Specification for the table to be added.
266 """
267 self._managers.opaque.register(tableName, spec)
269 @transactional
270 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
271 """Insert records into an opaque table.
273 Parameters
274 ----------
275 tableName : `str`
276 Logical name of the opaque table. Must match the name used in a
277 previous call to `registerOpaqueTable`.
278 data
279 Each additional positional argument is a dictionary that represents
280 a single row to be added.
281 """
282 self._managers.opaque[tableName].insert(*data)
284 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
285 """Retrieve records from an opaque table.
287 Parameters
288 ----------
289 tableName : `str`
290 Logical name of the opaque table. Must match the name used in a
291 previous call to `registerOpaqueTable`.
292 where
293 Additional keyword arguments are interpreted as equality
294 constraints that restrict the returned rows (combined with AND);
295 keyword arguments are column names and values are the values they
296 must have.
298 Yields
299 ------
300 row : `dict`
301 A dictionary representing a single result row.
302 """
303 yield from self._managers.opaque[tableName].fetch(**where)
305 @transactional
306 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
307 """Remove records from an opaque table.
309 Parameters
310 ----------
311 tableName : `str`
312 Logical name of the opaque table. Must match the name used in a
313 previous call to `registerOpaqueTable`.
314 where
315 Additional keyword arguments are interpreted as equality
316 constraints that restrict the deleted rows (combined with AND);
317 keyword arguments are column names and values are the values they
318 must have.
319 """
320 self._managers.opaque[tableName].delete(where.keys(), where)
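# A minimal sketch of the opaque-table round trip; the table name, row
# contents, and the exact ddl.FieldSpec arguments are assumptions:
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("datastore_records", spec)
#     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
#     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("datastore_records", dataset_id=1)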
322 def registerCollection(
323 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
324 ) -> bool:
325 # Docstring inherited from lsst.daf.butler.registry.Registry
326 _, registered = self._managers.collections.register(name, type, doc=doc)
327 return registered
329 def getCollectionType(self, name: str) -> CollectionType:
330 # Docstring inherited from lsst.daf.butler.registry.Registry
331 return self._managers.collections.find(name).type
333 def _get_collection_record(self, name: str) -> CollectionRecord:
334 # Docstring inherited from lsst.daf.butler.registry.Registry
335 return self._managers.collections.find(name)
337 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
338 # Docstring inherited from lsst.daf.butler.registry.Registry
339 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
340 return registered
342 @transactional
343 def removeCollection(self, name: str) -> None:
344 # Docstring inherited from lsst.daf.butler.registry.Registry
345 self._managers.collections.remove(name)
347 def getCollectionChain(self, parent: str) -> CollectionSearch:
348 # Docstring inherited from lsst.daf.butler.registry.Registry
349 record = self._managers.collections.find(parent)
350 if record.type is not CollectionType.CHAINED:
351 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
352 assert isinstance(record, ChainedCollectionRecord)
353 return record.children
355 @transactional
356 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
357 # Docstring inherited from lsst.daf.butler.registry.Registry
358 record = self._managers.collections.find(parent)
359 if record.type is not CollectionType.CHAINED:
360 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
361 assert isinstance(record, ChainedCollectionRecord)
362 children = CollectionSearch.fromExpression(children)
363 if children != record.children or flatten:
364 record.update(self._managers.collections, children, flatten=flatten)
366 def getCollectionParentChains(self, collection: str) -> Set[str]:
367 # Docstring inherited from lsst.daf.butler.registry.Registry
368 return {
369 record.name
370 for record in self._managers.collections.getParentChains(
371 self._managers.collections.find(collection).key
372 )
373 }
375 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
376 # Docstring inherited from lsst.daf.butler.registry.Registry
377 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
379 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
380 # Docstring inherited from lsst.daf.butler.registry.Registry
381 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
383 def getCollectionSummary(self, collection: str) -> CollectionSummary:
384 # Docstring inherited from lsst.daf.butler.registry.Registry
385 record = self._managers.collections.find(collection)
386 return self._managers.datasets.getCollectionSummary(record)
388 def registerDatasetType(self, datasetType: DatasetType) -> bool:
389 # Docstring inherited from lsst.daf.butler.registry.Registry
390 _, inserted = self._managers.datasets.register(datasetType)
391 return inserted
393 def removeDatasetType(self, name: str) -> None:
394 # Docstring inherited from lsst.daf.butler.registry.Registry
395 self._managers.datasets.remove(name)
397 def getDatasetType(self, name: str) -> DatasetType:
398 # Docstring inherited from lsst.daf.butler.registry.Registry
399 return self._managers.datasets[name].datasetType
401 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
402 # Docstring inherited from lsst.daf.butler.registry.Registry
403 return self._managers.datasets.supportsIdGenerationMode(mode)
405 def findDataset(
406 self,
407 datasetType: Union[DatasetType, str],
408 dataId: Optional[DataId] = None,
409 *,
410 collections: Any = None,
411 timespan: Optional[Timespan] = None,
412 **kwargs: Any,
413 ) -> Optional[DatasetRef]:
414 # Docstring inherited from lsst.daf.butler.registry.Registry
415 if isinstance(datasetType, DatasetType):
416 storage = self._managers.datasets[datasetType.name]
417 else:
418 storage = self._managers.datasets[datasetType]
419 dataId = DataCoordinate.standardize(
420 dataId,
421 graph=storage.datasetType.dimensions,
422 universe=self.dimensions,
423 defaults=self.defaults.dataId,
424 **kwargs,
425 )
426 if collections is None:
427 if not self.defaults.collections:
428 raise TypeError(
429 "No collections provided to findDataset, and no defaults from registry construction."
430 )
431 collections = self.defaults.collections
432 else:
433 collections = CollectionSearch.fromExpression(collections)
434 for collectionRecord in collections.iter(self._managers.collections):
435 if collectionRecord.type is CollectionType.CALIBRATION and (
436 not storage.datasetType.isCalibration() or timespan is None
437 ):
438 continue
439 result = storage.find(collectionRecord, dataId, timespan=timespan)
440 if result is not None:
441 return result
443 return None
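# A minimal lookup sketch; the dataset type, data ID keys, and collection
# name are hypothetical:
#
#     ref = registry.findDataset(
#         "calexp", instrument="HSC", visit=903334, detector=10,
#         collections=["HSC/runs/RC2"],
#     )
#     if ref is None:
#         ...  # no matching dataset in the given collections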
445 @transactional
446 def insertDatasets(
447 self,
448 datasetType: Union[DatasetType, str],
449 dataIds: Iterable[DataId],
450 run: Optional[str] = None,
451 expand: bool = True,
452 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
453 ) -> List[DatasetRef]:
454 # Docstring inherited from lsst.daf.butler.registry.Registry
455 if isinstance(datasetType, DatasetType):
456 storage = self._managers.datasets.find(datasetType.name)
457 if storage is None:
458 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
459 else:
460 storage = self._managers.datasets.find(datasetType)
461 if storage is None:
462 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
463 if run is None:
464 if self.defaults.run is None:
465 raise TypeError(
466 "No run provided to insertDatasets, and no default from registry construction."
467 )
468 run = self.defaults.run
469 runRecord = self._managers.collections.find(run)
470 if runRecord.type is not CollectionType.RUN:
471 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
472 assert isinstance(runRecord, RunRecord)
473 progress = Progress("lsst.daf.butler.Registry.insertDatasets", level=logging.DEBUG)
474 if expand:
475 expandedDataIds = [
476 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
477 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
478 ]
479 else:
480 expandedDataIds = [
481 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
482 ]
483 try:
484 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
485 except sqlalchemy.exc.IntegrityError as err:
486 raise ConflictingDefinitionError(
487 f"A database constraint failure was triggered by inserting "
488 f"one or more datasets of type {storage.datasetType} into "
489 f"collection '{run}'. "
490 f"This probably means a dataset with the same data ID "
491 f"and dataset type already exists, but it may also mean a "
492 f"dimension row is missing."
493 ) from err
494 return refs
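# A minimal insertion sketch; the dataset type, data ID, and run name are
# hypothetical, and the run collection must already exist:
#
#     registry.registerRun("u/someone/run")
#     refs = registry.insertDatasets(
#         "raw",
#         [{"instrument": "HSC", "exposure": 903334, "detector": 10}],
#         run="u/someone/run",
#     )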
496 @transactional
497 def _importDatasets(
498 self,
499 datasets: Iterable[DatasetRef],
500 expand: bool = True,
501 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
502 reuseIds: bool = False,
503 ) -> List[DatasetRef]:
504 # Docstring inherited from lsst.daf.butler.registry.Registry
505 datasets = list(datasets)
506 if not datasets:
507 # nothing to do
508 return []
510 # find dataset type
511 datasetTypes = set(dataset.datasetType for dataset in datasets)
512 if len(datasetTypes) != 1:
513 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
514 datasetType = datasetTypes.pop()
516 # get storage handler for this dataset type
517 storage = self._managers.datasets.find(datasetType.name)
518 if storage is None:
519 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
521 # find run name
522 runs = set(dataset.run for dataset in datasets)
523 if len(runs) != 1:
524 raise ValueError(f"Multiple run names in input datasets: {runs}")
525 run = runs.pop()
526 if run is None:
527 if self.defaults.run is None:
528 raise TypeError(
529 "No run provided to _importDatasets, and no default from registry construction."
530 )
531 run = self.defaults.run
533 runRecord = self._managers.collections.find(run)
534 if runRecord.type is not CollectionType.RUN:
535 raise TypeError(
536 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
537 " RUN collection required."
538 )
539 assert isinstance(runRecord, RunRecord)
541 progress = Progress("lsst.daf.butler.Registry._importDatasets", level=logging.DEBUG)
542 if expand:
543 expandedDatasets = [
544 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
545 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
546 ]
547 else:
548 expandedDatasets = [
549 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
550 for dataset in datasets
551 ]
553 try:
554 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
555 except sqlalchemy.exc.IntegrityError as err:
556 raise ConflictingDefinitionError(
557 f"A database constraint failure was triggered by inserting "
558 f"one or more datasets of type {storage.datasetType} into "
559 f"collection '{run}'. "
560 f"This probably means a dataset with the same data ID "
561 f"and dataset type already exists, but it may also mean a "
562 f"dimension row is missing."
563 ) from err
564 return refs
566 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
567 # Docstring inherited from lsst.daf.butler.registry.Registry
568 return self._managers.datasets.getDatasetRef(id)
570 @transactional
571 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
572 # Docstring inherited from lsst.daf.butler.registry.Registry
573 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
574 for datasetType, refsForType in progress.iter_item_chunks(
575 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
576 ):
577 storage = self._managers.datasets[datasetType.name]
578 try:
579 storage.delete(refsForType)
580 except sqlalchemy.exc.IntegrityError as err:
581 raise OrphanedRecordError(
582 "One or more datasets is still present in one or more Datastores."
583 ) from err
585 @transactional
586 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
587 # Docstring inherited from lsst.daf.butler.registry.Registry
588 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
589 collectionRecord = self._managers.collections.find(collection)
590 if collectionRecord.type is not CollectionType.TAGGED:
591 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
592 for datasetType, refsForType in progress.iter_item_chunks(
593 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
594 ):
595 storage = self._managers.datasets[datasetType.name]
596 try:
597 storage.associate(collectionRecord, refsForType)
598 except sqlalchemy.exc.IntegrityError as err:
599 raise ConflictingDefinitionError(
600 f"Constraint violation while associating dataset of type {datasetType.name} with "
601 f"collection {collection}. This probably means that one or more datasets with the same "
602 f"dataset type and data ID already exist in the collection, but it may also indicate "
603 f"that the datasets do not exist."
604 ) from err
606 @transactional
607 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
608 # Docstring inherited from lsst.daf.butler.registry.Registry
609 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
610 collectionRecord = self._managers.collections.find(collection)
611 if collectionRecord.type is not CollectionType.TAGGED:
612 raise TypeError(
613 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
614 )
615 for datasetType, refsForType in progress.iter_item_chunks(
616 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
617 ):
618 storage = self._managers.datasets[datasetType.name]
619 storage.disassociate(collectionRecord, refsForType)
621 @transactional
622 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
623 # Docstring inherited from lsst.daf.butler.registry.Registry
624 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
625 collectionRecord = self._managers.collections.find(collection)
626 for datasetType, refsForType in progress.iter_item_chunks(
627 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
628 ):
629 storage = self._managers.datasets[datasetType.name]
630 storage.certify(collectionRecord, refsForType, timespan)
632 @transactional
633 def decertify(
634 self,
635 collection: str,
636 datasetType: Union[str, DatasetType],
637 timespan: Timespan,
638 *,
639 dataIds: Optional[Iterable[DataId]] = None,
640 ) -> None:
641 # Docstring inherited from lsst.daf.butler.registry.Registry
642 collectionRecord = self._managers.collections.find(collection)
643 if isinstance(datasetType, str):
644 storage = self._managers.datasets[datasetType]
645 else:
646 storage = self._managers.datasets[datasetType.name]
647 standardizedDataIds = None
648 if dataIds is not None:
649 standardizedDataIds = [
650 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
651 ]
652 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
654 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
655 """Return an object that allows a new `Datastore` instance to
656 communicate with this `Registry`.
658 Returns
659 -------
660 manager : `DatastoreRegistryBridgeManager`
661 Object that mediates communication between this `Registry` and its
662 associated datastores.
663 """
664 return self._managers.datastores
666 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
667 # Docstring inherited from lsst.daf.butler.registry.Registry
668 return self._managers.datastores.findDatastores(ref)
670 def expandDataId(
671 self,
672 dataId: Optional[DataId] = None,
673 *,
674 graph: Optional[DimensionGraph] = None,
675 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
676 withDefaults: bool = True,
677 **kwargs: Any,
678 ) -> DataCoordinate:
679 # Docstring inherited from lsst.daf.butler.registry.Registry
680 if not withDefaults:
681 defaults = None
682 else:
683 defaults = self.defaults.dataId
684 standardized = DataCoordinate.standardize(
685 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
686 )
687 if standardized.hasRecords():
688 return standardized
689 if records is None:
690 records = {}
691 elif isinstance(records, NamedKeyMapping):
692 records = records.byName()
693 else:
694 records = dict(records)
695 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
696 records.update(dataId.records.byName())
697 keys = standardized.byName()
698 for element in standardized.graph.primaryKeyTraversalOrder:
699 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
700 if record is ...:
701 if isinstance(element, Dimension) and keys.get(element.name) is None:
702 if element in standardized.graph.required:
703 raise LookupError(f"No value or null value for required dimension {element.name}.")
704 keys[element.name] = None
705 record = None
706 else:
707 storage = self._managers.dimensions[element]
708 dataIdSet = DataCoordinateIterable.fromScalar(
709 DataCoordinate.standardize(keys, graph=element.graph)
710 )
711 fetched = tuple(storage.fetch(dataIdSet))
712 try:
713 (record,) = fetched
714 except ValueError:
715 record = None
716 records[element.name] = record
717 if record is not None:
718 for d in element.implied:
719 value = getattr(record, d.name)
720 if keys.setdefault(d.name, value) != value:
721 raise InconsistentDataIdError(
722 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
723 f"but {element.name} implies {d.name}={value!r}."
724 )
725 else:
726 if element in standardized.graph.required:
727 raise LookupError(
728 f"Could not fetch record for required dimension {element.name} via keys {keys}."
729 )
730 if element.alwaysJoin:
731 raise InconsistentDataIdError(
732 f"Could not fetch record for element {element.name} via keys {keys}, ",
733 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
734 "related.",
735 )
736 for d in element.implied:
737 keys.setdefault(d.name, None)
738 records.setdefault(d.name, None)
739 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
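# A minimal expansion sketch; the data ID values are hypothetical. The
# returned DataCoordinate carries implied dimensions and dimension records:
#
#     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
#     assert dataId.hasRecords()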
741 def insertDimensionData(
742 self,
743 element: Union[DimensionElement, str],
744 *data: Union[Mapping[str, Any], DimensionRecord],
745 conform: bool = True,
746 replace: bool = False,
747 ) -> None:
748 # Docstring inherited from lsst.daf.butler.registry.Registry
749 if conform:
750 if isinstance(element, str):
751 element = self.dimensions[element]
752 records = [
753 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
754 ]
755 else:
756 # Ignore typing since caller said to trust them with conform=False.
757 records = data # type: ignore
758 storage = self._managers.dimensions[element] # type: ignore
759 storage.insert(*records, replace=replace)
761 def syncDimensionData(
762 self,
763 element: Union[DimensionElement, str],
764 row: Union[Mapping[str, Any], DimensionRecord],
765 conform: bool = True,
766 update: bool = False,
767 ) -> Union[bool, Dict[str, Any]]:
768 # Docstring inherited from lsst.daf.butler.registry.Registry
769 if conform:
770 if isinstance(element, str):
771 element = self.dimensions[element]
772 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
773 else:
774 # Ignore typing since caller said to trust them with conform=False.
775 record = row # type: ignore
776 storage = self._managers.dimensions[element] # type: ignore
777 return storage.sync(record, update=update)
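# A minimal sketch of inserting and then syncing a dimension row; the record
# contents are hypothetical and depend on the configured dimension universe:
#
#     registry.insertDimensionData("instrument", {"name": "HSC", "detector_max": 200})
#     registry.syncDimensionData("instrument", {"name": "HSC", "detector_max": 200}, update=True)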
779 def queryDatasetTypes(
780 self,
781 expression: Any = ...,
782 *,
783 components: Optional[bool] = None,
784 missing: Optional[List[str]] = None,
785 ) -> Iterator[DatasetType]:
786 # Docstring inherited from lsst.daf.butler.registry.Registry
787 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
788 if wildcard is Ellipsis:
789 for datasetType in self._managers.datasets:
790 # The dataset type can no longer be a component
791 yield datasetType
792 if components:
793 # Automatically create the component dataset types
794 try:
795 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
796 except KeyError as err:
797 _LOG.warning(
798 f"Could not load storage class {err} for {datasetType.name}; "
799 "if it has components they will not be included in query results."
800 )
801 else:
802 yield from componentsForDatasetType
803 return
804 done: Set[str] = set()
805 for name in wildcard.strings:
806 storage = self._managers.datasets.find(name)
807 done.add(name)
808 if storage is None:
809 if missing is not None:
810 missing.append(name)
811 else:
812 yield storage.datasetType
813 if wildcard.patterns:
814 # If components (the argument) is None, we'll save component
815 # dataset types that we might want to match, but only if their
816 # parents didn't get included.
817 componentsForLater = []
818 for registeredDatasetType in self._managers.datasets:
819 # Components are not stored in registry so expand them here
820 allDatasetTypes = [registeredDatasetType]
821 try:
822 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
823 except KeyError as err:
824 _LOG.warning(
825 f"Could not load storage class {err} for {registeredDatasetType.name}; "
826 "if it has components they will not be included in query results."
827 )
828 for datasetType in allDatasetTypes:
829 if datasetType.name in done:
830 continue
831 parentName, componentName = datasetType.nameAndComponent()
832 if componentName is not None and not components:
833 if components is None and parentName not in done:
834 componentsForLater.append(datasetType)
835 continue
836 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
837 done.add(datasetType.name)
838 yield datasetType
839 # Go back and try to match saved components.
840 for datasetType in componentsForLater:
841 parentName, _ = datasetType.nameAndComponent()
842 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
843 yield datasetType
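# A minimal query sketch using a regular-expression pattern; the pattern and
# any matching dataset type names are hypothetical:
#
#     import re
#     for datasetType in registry.queryDatasetTypes(re.compile(r"calexp.*"), components=False):
#         print(datasetType.name)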
845 def queryCollections(
846 self,
847 expression: Any = ...,
848 datasetType: Optional[DatasetType] = None,
849 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
850 flattenChains: bool = False,
851 includeChains: Optional[bool] = None,
852 ) -> Iterator[str]:
853 # Docstring inherited from lsst.daf.butler.registry.Registry
855 # Right now the datasetType argument is completely ignored, but that
856 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
857 # ticket will take care of that.
858 query = CollectionQuery.fromExpression(expression)
859 collectionTypes = ensure_iterable(collectionTypes)
860 for record in query.iter(
861 self._managers.collections,
862 collectionTypes=frozenset(collectionTypes),
863 flattenChains=flattenChains,
864 includeChains=includeChains,
865 ):
866 yield record.name
868 def _makeQueryBuilder(
869 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
870 ) -> queries.QueryBuilder:
871 """Return a `QueryBuilder` instance capable of constructing and
872 managing more complex queries than those obtainable via `Registry`
873 interfaces.
875 This is an advanced interface; downstream code should prefer
876 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
877 are sufficient.
879 Parameters
880 ----------
881 summary : `queries.QuerySummary`
882 Object describing and categorizing the full set of dimensions that
883 will be included in the query.
884 doomed_by : `Iterable` of `str`, optional
885 A list of diagnostic messages that indicate why the query is going
886 to yield no results and should not even be executed. If an empty
887 container (default) the query will be executed unless other code
888 determines that it is doomed.
890 Returns
891 -------
892 builder : `queries.QueryBuilder`
893 Object that can be used to construct and perform advanced queries.
894 """
895 return queries.QueryBuilder(
896 summary,
897 queries.RegistryManagers(
898 collections=self._managers.collections,
899 dimensions=self._managers.dimensions,
900 datasets=self._managers.datasets,
901 TimespanReprClass=self._db.getTimespanRepresentation(),
902 ),
903 doomed_by=doomed_by,
904 )
906 def queryDatasets(
907 self,
908 datasetType: Any,
909 *,
910 collections: Any = None,
911 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
912 dataId: Optional[DataId] = None,
913 where: Optional[str] = None,
914 findFirst: bool = False,
915 components: Optional[bool] = None,
916 bind: Optional[Mapping[str, Any]] = None,
917 check: bool = True,
918 **kwargs: Any,
919 ) -> queries.DatasetQueryResults:
920 # Docstring inherited from lsst.daf.butler.registry.Registry
922 # Standardize the collections expression.
923 if collections is None:
924 if not self.defaults.collections:
925 raise TypeError(
926 "No collections provided to queryDatasets, and no defaults from registry construction."
927 )
928 collections = self.defaults.collections
929 elif findFirst:
930 collections = CollectionSearch.fromExpression(collections)
931 else:
932 collections = CollectionQuery.fromExpression(collections)
933 # Standardize and expand the data ID provided as a constraint.
934 standardizedDataId = self.expandDataId(dataId, **kwargs)
936 # We can only query directly if given a non-component DatasetType
937 # instance. If we were given an expression or str or a component
938 # DatasetType instance, we'll populate this dict, recurse, and return.
939 # If we already have a non-component DatasetType, it will remain None
940 # and we'll run the query directly.
941 composition: Optional[
942 Dict[
943 DatasetType, List[Optional[str]] # parent dataset type -> component names (None for the parent itself)
944 ]
945 ] = None
946 if not isinstance(datasetType, DatasetType):
947 # We were given a dataset type expression (which may be as simple
948 # as a str). Loop over all matching datasets, delegating handling
949 # of the `components` argument to queryDatasetTypes, as we populate
950 # the composition dict.
951 composition = defaultdict(list)
952 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
953 parentName, componentName = trueDatasetType.nameAndComponent()
954 if componentName is not None:
955 parentDatasetType = self.getDatasetType(parentName)
956 composition.setdefault(parentDatasetType, []).append(componentName)
957 else:
958 composition.setdefault(trueDatasetType, []).append(None)
959 if not composition:
960 return queries.ChainedDatasetQueryResults(
961 [],
962 doomed_by=[
963 f"No registered dataset type matching {t!r} found."
964 for t in ensure_iterable(datasetType)
965 ],
966 )
967 elif datasetType.isComponent():
968 # We were given a true DatasetType instance, but it's a component.
969 # The composition dict will have exactly one item.
970 parentName, componentName = datasetType.nameAndComponent()
971 parentDatasetType = self.getDatasetType(parentName)
972 composition = {parentDatasetType: [componentName]}
973 if composition is not None:
974 # We need to recurse. Do that once for each parent dataset type.
975 chain = []
976 for parentDatasetType, componentNames in composition.items():
977 parentResults = self.queryDatasets(
978 parentDatasetType,
979 collections=collections,
980 dimensions=dimensions,
981 dataId=standardizedDataId,
982 where=where,
983 bind=bind,
984 findFirst=findFirst,
985 check=check,
986 )
987 assert isinstance(
988 parentResults, queries.ParentDatasetQueryResults
989 ), "Should always be true if passing in a DatasetType instance, and we are."
990 chain.append(parentResults.withComponents(componentNames))
991 return queries.ChainedDatasetQueryResults(chain)
992 # If we get here, there's no need to recurse (or we are already
993 # recursing; there can only ever be one level of recursion).
995 # The full set of dimensions in the query is the combination of those
996 # needed for the DatasetType and those explicitly requested, if any.
997 requestedDimensionNames = set(datasetType.dimensions.names)
998 if dimensions is not None:
999 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1000 # Construct the summary structure needed to construct a QueryBuilder.
1001 summary = queries.QuerySummary(
1002 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1003 dataId=standardizedDataId,
1004 expression=where,
1005 bind=bind,
1006 defaults=self.defaults.dataId,
1007 check=check,
1008 datasets=[datasetType],
1009 )
1010 builder = self._makeQueryBuilder(summary)
1011 # Add the dataset subquery to the query, telling the QueryBuilder to
1012 # include the rank of the selected collection in the results only if we
1013 # need a find-first search. Note that if any of the collections are
1014 # actually wildcard expressions and we've asked for a find-first search,
1015 # this will raise TypeError for us.
1016 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
1017 query = builder.finish()
1018 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
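# A minimal dataset-query sketch; the dataset type, collection, and the
# values in the ``where`` expression are hypothetical:
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["HSC/runs/RC2"],
#         where="visit = 903334 AND detector IN (10, 11)",
#         findFirst=True,
#     )
#     for ref in refs:
#         print(ref.dataId)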
1020 def queryDataIds(
1021 self,
1022 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1023 *,
1024 dataId: Optional[DataId] = None,
1025 datasets: Any = None,
1026 collections: Any = None,
1027 where: Optional[str] = None,
1028 components: Optional[bool] = None,
1029 bind: Optional[Mapping[str, Any]] = None,
1030 check: bool = True,
1031 **kwargs: Any,
1032 ) -> queries.DataCoordinateQueryResults:
1033 # Docstring inherited from lsst.daf.butler.registry.Registry
1034 dimensions = ensure_iterable(dimensions)
1035 standardizedDataId = self.expandDataId(dataId, **kwargs)
1036 standardizedDatasetTypes = set()
1037 requestedDimensions = self.dimensions.extract(dimensions)
1038 missing: List[str] = []
1039 if datasets is not None:
1040 if not collections:
1041 if not self.defaults.collections:
1042 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
1043 collections = self.defaults.collections
1044 else:
1045 # Preprocess collections expression in case the original
1046 # included single-pass iterators (we'll want to use it multiple
1047 # times below).
1048 collections = CollectionQuery.fromExpression(collections)
1049 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
1050 # If any matched dataset type is a component, just operate on
1051 # its parent instead, because Registry doesn't know anything
1052 # about what components exist, and here (unlike queryDatasets)
1053 # we don't care about returning them.
1054 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1055 if componentName is not None:
1056 datasetType = self.getDatasetType(parentDatasetTypeName)
1057 standardizedDatasetTypes.add(datasetType)
1058 elif collections:
1059 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1061 def query_factory(
1062 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1063 ) -> Query:
1064 """Construct the Query object that generates query results."""
1065 summary = queries.QuerySummary(
1066 requested=requestedDimensions,
1067 dataId=standardizedDataId,
1068 expression=where,
1069 bind=bind,
1070 defaults=self.defaults.dataId,
1071 check=check,
1072 datasets=standardizedDatasetTypes,
1073 order_by=order_by,
1074 limit=limit,
1075 )
1076 builder = self._makeQueryBuilder(
1077 summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
1078 )
1079 for datasetType in standardizedDatasetTypes:
1080 builder.joinDataset(
1081 datasetType,
1082 collections,
1083 isResult=False,
1084 )
1085 return builder.finish()
1087 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
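# A minimal data-ID-query sketch; the dimensions, dataset type, and
# collection name are hypothetical:
#
#     dataIds = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="HSC/raw/all",
#         instrument="HSC",
#     )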
1089 def queryDimensionRecords(
1090 self,
1091 element: Union[DimensionElement, str],
1092 *,
1093 dataId: Optional[DataId] = None,
1094 datasets: Any = None,
1095 collections: Any = None,
1096 where: Optional[str] = None,
1097 components: Optional[bool] = None,
1098 bind: Optional[Mapping[str, Any]] = None,
1099 check: bool = True,
1100 **kwargs: Any,
1101 ) -> queries.DimensionRecordQueryResults:
1102 # Docstring inherited from lsst.daf.butler.registry.Registry
1103 if not isinstance(element, DimensionElement):
1104 try:
1105 element = self.dimensions[element]
1106 except KeyError as e:
1107 raise KeyError(
1108 f"No such dimension '{element}', available dimensions: "
1109 + str(self.dimensions.getStaticElements())
1110 ) from e
1111 dataIds = self.queryDataIds(
1112 element.graph,
1113 dataId=dataId,
1114 datasets=datasets,
1115 collections=collections,
1116 where=where,
1117 components=components,
1118 bind=bind,
1119 check=check,
1120 **kwargs,
1121 )
1122 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1124 def queryDatasetAssociations(
1125 self,
1126 datasetType: Union[str, DatasetType],
1127 collections: Any = ...,
1128 *,
1129 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1130 flattenChains: bool = False,
1131 ) -> Iterator[DatasetAssociation]:
1132 # Docstring inherited from lsst.daf.butler.registry.Registry
1133 if collections is None:
1134 if not self.defaults.collections:
1135 raise TypeError(
1136 "No collections provided to queryDatasetAssociations, and no defaults from registry construction."
1137 )
1138 collections = self.defaults.collections
1139 else:
1140 collections = CollectionQuery.fromExpression(collections)
1141 TimespanReprClass = self._db.getTimespanRepresentation()
1142 if isinstance(datasetType, str):
1143 storage = self._managers.datasets[datasetType]
1144 else:
1145 storage = self._managers.datasets[datasetType.name]
1146 for collectionRecord in collections.iter(
1147 self._managers.collections,
1148 collectionTypes=frozenset(collectionTypes),
1149 flattenChains=flattenChains,
1150 ):
1151 query = storage.select(collectionRecord)
1152 for row in self._db.query(query.combine()).mappings():
1153 dataId = DataCoordinate.fromRequiredValues(
1154 storage.datasetType.dimensions,
1155 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1156 )
1157 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1158 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1159 if collectionRecord.type is CollectionType.CALIBRATION:
1160 timespan = TimespanReprClass.extract(row)
1161 else:
1162 timespan = None
1163 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
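# A minimal sketch; the dataset type name and collection wildcard are
# hypothetical:
#
#     for assoc in registry.queryDatasetAssociations("bias", collections="HSC/calib*"):
#         print(assoc.collection, assoc.ref.dataId, assoc.timespan)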
1165 storageClasses: StorageClassFactory
1166 """All storage classes known to the registry (`StorageClassFactory`).
1167 """