Coverage for python/lsst/daf/butler/registries/sql.py: 13%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
31import sqlalchemy
32from lsst.resources import ResourcePathExpression
33from lsst.utils.iteration import ensure_iterable
35from ..core import (
36 Config,
37 DataCoordinate,
38 DataCoordinateIterable,
39 DataId,
40 DatasetAssociation,
41 DatasetId,
42 DatasetRef,
43 DatasetType,
44 Dimension,
45 DimensionConfig,
46 DimensionElement,
47 DimensionGraph,
48 DimensionRecord,
49 DimensionUniverse,
50 NamedKeyMapping,
51 NameLookupMapping,
52 Progress,
53 StorageClassFactory,
54 Timespan,
55 ddl,
56)
57from ..core.utils import transactional
58from ..registry import (
59 ArgumentError,
60 CollectionExpressionError,
61 CollectionSearch,
62 CollectionType,
63 CollectionTypeError,
64 ConflictingDefinitionError,
65 DataIdValueError,
66 DatasetTypeError,
67 DatasetTypeExpressionError,
68 DimensionNameError,
69 InconsistentDataIdError,
70 NoDefaultCollectionError,
71 OrphanedRecordError,
72 Registry,
73 RegistryConfig,
74 RegistryDefaults,
75 queries,
76)
77from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
78from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
79from ..registry.queries import Query
80from ..registry.summaries import CollectionSummary
81from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
83if TYPE_CHECKING:  # 83 ↛ 84: line 83 didn't jump to line 84, because the condition on line 83 was never true
84 from .._butlerConfig import ButlerConfig
85 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
88_LOG = logging.getLogger(__name__)
91class SqlRegistry(Registry):
92 """Registry implementation based on SQLAlchemy.
94 Parameters
95 ----------
96 database : `Database`
97 Database instance to store Registry.
98 defaults : `RegistryDefaults`
99 Default collection search path and/or output `~CollectionType.RUN`
100 collection.
101 managers : `RegistryManagerInstances`
102 All the managers required for this registry.
103 """
105 defaultConfigFile: Optional[str] = None
106 """Path to configuration defaults. Accessed within the ``configs`` resource
107 or relative to a search path. Can be None if no defaults specified.
108 """
110 @classmethod
111 def createFromConfig(
112 cls,
113 config: Optional[Union[RegistryConfig, str]] = None,
114 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
115 butlerRoot: Optional[ResourcePathExpression] = None,
116 ) -> Registry:
117 """Create registry database and return `SqlRegistry` instance.
119 This method initializes database contents; the database must be empty
120 prior to calling this method.
122 Parameters
123 ----------
124 config : `RegistryConfig` or `str`, optional
125 Registry configuration; if missing, the default configuration is
126 loaded from registry.yaml.
127 dimensionConfig : `DimensionConfig` or `str`, optional
128 Dimensions configuration; if missing, the default configuration is
129 loaded from dimensions.yaml.
130 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
131 Path to the repository root this `SqlRegistry` will manage.
133 Returns
134 -------
135 registry : `SqlRegistry`
136 A new `SqlRegistry` instance.
137 """
138 config = cls.forceRegistryConfig(config)
139 config.replaceRoot(butlerRoot)
141 if isinstance(dimensionConfig, str):
142 dimensionConfig = DimensionConfig(dimensionConfig)
143 elif dimensionConfig is None:
144 dimensionConfig = DimensionConfig()
145 elif not isinstance(dimensionConfig, DimensionConfig):
146 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
148 DatabaseClass = config.getDatabaseClass()
149 database = DatabaseClass.fromUri(
150 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
151 )
152 managerTypes = RegistryManagerTypes.fromConfig(config)
153 managers = managerTypes.makeRepo(database, dimensionConfig)
154 return cls(database, RegistryDefaults(), managers)
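    # Illustrative usage sketch (not part of this module): creating a brand-new
    # registry database from configuration. The file and repository paths below
    # are assumptions, not anything defined here.
    #
    #   >>> from lsst.daf.butler.registry import RegistryConfig
    #   >>> config = RegistryConfig("registry.yaml")            # hypothetical path
    #   >>> registry = SqlRegistry.createFromConfig(config, butlerRoot="/repo")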
156 @classmethod
157 def fromConfig(
158 cls,
159 config: Union[ButlerConfig, RegistryConfig, Config, str],
160 butlerRoot: Optional[ResourcePathExpression] = None,
161 writeable: bool = True,
162 defaults: Optional[RegistryDefaults] = None,
163 ) -> Registry:
164 """Create `Registry` subclass instance from `config`.
166 Registry database must be initialized prior to calling this method.
168 Parameters
169 ----------
170 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
171 Registry configuration.
172 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
173 Path to the repository root this `Registry` will manage.
174 writeable : `bool`, optional
175 If `True` (default) create a read-write connection to the database.
176 defaults : `RegistryDefaults`, optional
177 Default collection search path and/or output `~CollectionType.RUN`
178 collection.
180 Returns
181 -------
182 registry : `SqlRegistry` (subclass)
183 A new `SqlRegistry` subclass instance.
184 """
185 config = cls.forceRegistryConfig(config)
186 config.replaceRoot(butlerRoot)
187 DatabaseClass = config.getDatabaseClass()
188 database = DatabaseClass.fromUri(
189 str(config.connectionString),
190 origin=config.get("origin", 0),
191 namespace=config.get("namespace"),
192 writeable=writeable,
193 )
194 managerTypes = RegistryManagerTypes.fromConfig(config)
195 managers = managerTypes.loadRepo(database)
196 if defaults is None:
197 defaults = RegistryDefaults()
198 return cls(database, defaults, managers)
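    # Illustrative usage sketch (not part of this module): connecting read-only to
    # an existing repository with hypothetical default collections.
    #
    #   >>> defaults = RegistryDefaults(collections=["refcats"], run=None)
    #   >>> registry = SqlRegistry.fromConfig("registry.yaml", writeable=False,
    #   ...                                   defaults=defaults)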
200 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
201 self._db = database
202 self._managers = managers
203 self.storageClasses = StorageClassFactory()
204 # Intentionally invoke property setter to initialize defaults. This
205 # can only be done after most of the rest of Registry has already been
206 # initialized, and must be done before the property getter is used.
207 self.defaults = defaults
209 def __str__(self) -> str:
210 return str(self._db)
212 def __repr__(self) -> str:
213 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
215 def isWriteable(self) -> bool:
216 # Docstring inherited from lsst.daf.butler.registry.Registry
217 return self._db.isWriteable()
219 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
220 # Docstring inherited from lsst.daf.butler.registry.Registry
221 if defaults is None:
222 # No need to copy, because `RegistryDefaults` is immutable; we
223 # effectively copy on write.
224 defaults = self.defaults
225 return type(self)(self._db, defaults, self._managers)
227 @property
228 def dimensions(self) -> DimensionUniverse:
229 # Docstring inherited from lsst.daf.butler.registry.Registry
230 return self._managers.dimensions.universe
232 def refresh(self) -> None:
233 # Docstring inherited from lsst.daf.butler.registry.Registry
234 self._managers.refresh()
236 @contextlib.contextmanager
237 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
238 # Docstring inherited from lsst.daf.butler.registry.Registry
239 try:
240 with self._db.transaction(savepoint=savepoint):
241 yield
242 except BaseException:
243 # TODO: this clears the caches sometimes when we wouldn't actually
244 # need to. Can we avoid that?
245 self._managers.dimensions.clearCaches()
246 raise
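    # Illustrative sketch, assuming ``registry`` is a constructed `SqlRegistry`:
    # grouping operations so they succeed or fail together. Collection names are
    # hypothetical.
    #
    #   >>> with registry.transaction(savepoint=True):
    #   ...     registry.registerRun("u/someone/example-run")
    #   ...     registry.registerCollection("example-tagged", CollectionType.TAGGED)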
248 def resetConnectionPool(self) -> None:
249 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
251 This operation is useful when using the registry with fork-based
252 multiprocessing. To use the registry across a fork boundary, make sure
253 that there are no currently active connections (no session or
254 transaction in progress) and that the connection pool is reset using
255 this method. The child process should call this method immediately
256 after the fork.
257 """
258 self._db._engine.dispose()
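    # Illustrative sketch, assuming ``registry`` is a constructed `SqlRegistry`:
    # resetting the pool in a forked child so it does not reuse the parent's
    # database connections.
    #
    #   >>> import os
    #   >>> if os.fork() == 0:               # child process
    #   ...     registry.resetConnectionPool()
    #   ...     # child-side registry work starts here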
260 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
261 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
262 other data repository client.
264 Opaque table records can be added via `insertOpaqueData`, retrieved via
265 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
267 Parameters
268 ----------
269 tableName : `str`
270 Logical name of the opaque table. This may differ from the
271 actual name used in the database by a prefix and/or suffix.
272 spec : `ddl.TableSpec`
273 Specification for the table to be added.
274 """
275 self._managers.opaque.register(tableName, spec)
277 @transactional
278 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
279 """Insert records into an opaque table.
281 Parameters
282 ----------
283 tableName : `str`
284 Logical name of the opaque table. Must match the name used in a
285 previous call to `registerOpaqueTable`.
286 data
287 Each additional positional argument is a dictionary that represents
288 a single row to be added.
289 """
290 self._managers.opaque[tableName].insert(*data)
292 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
293 """Retrieve records from an opaque table.
295 Parameters
296 ----------
297 tableName : `str`
298 Logical name of the opaque table. Must match the name used in a
299 previous call to `registerOpaqueTable`.
300 where
301 Additional keyword arguments are interpreted as equality
302 constraints that restrict the returned rows (combined with AND);
303 keyword arguments are column names and values are the values they
304 must have.
306 Yields
307 ------
308 row : `dict`
309 A dictionary representing a single result row.
310 """
311 yield from self._managers.opaque[tableName].fetch(**where)
313 @transactional
314 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
315 """Remove records from an opaque table.
317 Parameters
318 ----------
319 tableName : `str`
320 Logical name of the opaque table. Must match the name used in a
321 previous call to `registerOpaqueTable`.
322 where
323 Additional keyword arguments are interpreted as equality
324 constraints that restrict the deleted rows (combined with AND);
325 keyword arguments are column names and values are the values they
326 must have.
327 """
328 self._managers.opaque[tableName].delete(where.keys(), where)
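    # Illustrative sketch of the opaque-table round trip described above; the
    # table name and column layout are assumptions, not anything defined here.
    #
    #   >>> spec = ddl.TableSpec(fields=[
    #   ...     ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #   ...     ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    #   ... ])
    #   >>> registry.registerOpaqueTable("example_datastore_records", spec)
    #   >>> registry.insertOpaqueData("example_datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #   >>> rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
    #   >>> registry.deleteOpaqueData("example_datastore_records", dataset_id=1)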
330 def registerCollection(
331 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
332 ) -> bool:
333 # Docstring inherited from lsst.daf.butler.registry.Registry
334 _, registered = self._managers.collections.register(name, type, doc=doc)
335 return registered
337 def getCollectionType(self, name: str) -> CollectionType:
338 # Docstring inherited from lsst.daf.butler.registry.Registry
339 return self._managers.collections.find(name).type
341 def _get_collection_record(self, name: str) -> CollectionRecord:
342 # Docstring inherited from lsst.daf.butler.registry.Registry
343 return self._managers.collections.find(name)
345 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
346 # Docstring inherited from lsst.daf.butler.registry.Registry
347 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
348 return registered
350 @transactional
351 def removeCollection(self, name: str) -> None:
352 # Docstring inherited from lsst.daf.butler.registry.Registry
353 self._managers.collections.remove(name)
355 def getCollectionChain(self, parent: str) -> CollectionSearch:
356 # Docstring inherited from lsst.daf.butler.registry.Registry
357 record = self._managers.collections.find(parent)
358 if record.type is not CollectionType.CHAINED:
359 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
360 assert isinstance(record, ChainedCollectionRecord)
361 return record.children
363 @transactional
364 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
365 # Docstring inherited from lsst.daf.butler.registry.Registry
366 record = self._managers.collections.find(parent)
367 if record.type is not CollectionType.CHAINED:
368 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
369 assert isinstance(record, ChainedCollectionRecord)
370 children = CollectionSearch.fromExpression(children)
371 if children != record.children or flatten:
372 record.update(self._managers.collections, children, flatten=flatten)
374 def getCollectionParentChains(self, collection: str) -> Set[str]:
375 # Docstring inherited from lsst.daf.butler.registry.Registry
376 return {
377 record.name
378 for record in self._managers.collections.getParentChains(
379 self._managers.collections.find(collection).key
380 )
381 }
383 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
384 # Docstring inherited from lsst.daf.butler.registry.Registry
385 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
387 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
388 # Docstring inherited from lsst.daf.butler.registry.Registry
389 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
391 def getCollectionSummary(self, collection: str) -> CollectionSummary:
392 # Docstring inherited from lsst.daf.butler.registry.Registry
393 record = self._managers.collections.find(collection)
394 return self._managers.datasets.getCollectionSummary(record)
396 def registerDatasetType(self, datasetType: DatasetType) -> bool:
397 # Docstring inherited from lsst.daf.butler.registry.Registry
398 _, inserted = self._managers.datasets.register(datasetType)
399 return inserted
401 def removeDatasetType(self, name: str) -> None:
402 # Docstring inherited from lsst.daf.butler.registry.Registry
403 self._managers.datasets.remove(name)
405 def getDatasetType(self, name: str) -> DatasetType:
406 # Docstring inherited from lsst.daf.butler.registry.Registry
407 return self._managers.datasets[name].datasetType
409 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
410 # Docstring inherited from lsst.daf.butler.registry.Registry
411 return self._managers.datasets.supportsIdGenerationMode(mode)
413 def findDataset(
414 self,
415 datasetType: Union[DatasetType, str],
416 dataId: Optional[DataId] = None,
417 *,
418 collections: Any = None,
419 timespan: Optional[Timespan] = None,
420 **kwargs: Any,
421 ) -> Optional[DatasetRef]:
422 # Docstring inherited from lsst.daf.butler.registry.Registry
423 if isinstance(datasetType, DatasetType):
424 storage = self._managers.datasets[datasetType.name]
425 else:
426 storage = self._managers.datasets[datasetType]
427 dataId = DataCoordinate.standardize(
428 dataId,
429 graph=storage.datasetType.dimensions,
430 universe=self.dimensions,
431 defaults=self.defaults.dataId,
432 **kwargs,
433 )
434 if collections is None:
435 if not self.defaults.collections:
436 raise NoDefaultCollectionError(
437 "No collections provided to findDataset, and no defaults from registry construction."
438 )
439 collections = self.defaults.collections
440 else:
441 collections = CollectionSearch.fromExpression(collections)
442 for collectionRecord in collections.iter(self._managers.collections):
443 if collectionRecord.type is CollectionType.CALIBRATION and (
444 not storage.datasetType.isCalibration() or timespan is None
445 ):
446 continue
447 result = storage.find(collectionRecord, dataId, timespan=timespan)
448 if result is not None:
449 return result
451 return None
453 @transactional
454 def insertDatasets(
455 self,
456 datasetType: Union[DatasetType, str],
457 dataIds: Iterable[DataId],
458 run: Optional[str] = None,
459 expand: bool = True,
460 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
461 ) -> List[DatasetRef]:
462 # Docstring inherited from lsst.daf.butler.registry.Registry
463 if isinstance(datasetType, DatasetType):
464 storage = self._managers.datasets.find(datasetType.name)
465 if storage is None:
466 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
467 else:
468 storage = self._managers.datasets.find(datasetType)
469 if storage is None:
470 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
471 if run is None:
472 if self.defaults.run is None:
473 raise NoDefaultCollectionError(
474 "No run provided to insertDatasets, and no default from registry construction."
475 )
476 run = self.defaults.run
477 runRecord = self._managers.collections.find(run)
478 if runRecord.type is not CollectionType.RUN:
479 raise CollectionTypeError(
480 f"Given collection is of type {runRecord.type.name}; RUN collection required."
481 )
482 assert isinstance(runRecord, RunRecord)
483 progress = Progress("lsst.daf.butler.Registry.insertDatasets", level=logging.DEBUG)
484 if expand:
485 expandedDataIds = [
486 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
487 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
488 ]
489 else:
490 expandedDataIds = [
491 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
492 ]
493 try:
494 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
495 except sqlalchemy.exc.IntegrityError as err:
496 raise ConflictingDefinitionError(
497 f"A database constraint failure was triggered by inserting "
498 f"one or more datasets of type {storage.datasetType} into "
499 f"collection '{run}'. "
500 f"This probably means a dataset with the same data ID "
501 f"and dataset type already exists, but it may also mean a "
502 f"dimension row is missing."
503 ) from err
504 return refs
506 @transactional
507 def _importDatasets(
508 self,
509 datasets: Iterable[DatasetRef],
510 expand: bool = True,
511 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
512 reuseIds: bool = False,
513 ) -> List[DatasetRef]:
514 # Docstring inherited from lsst.daf.butler.registry.Registry
515 datasets = list(datasets)
516 if not datasets:
517 # nothing to do
518 return []
520 # find dataset type
521 datasetTypes = set(dataset.datasetType for dataset in datasets)
522 if len(datasetTypes) != 1:
523 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
524 datasetType = datasetTypes.pop()
526 # get storage handler for this dataset type
527 storage = self._managers.datasets.find(datasetType.name)
528 if storage is None:
529 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
531 # find run name
532 runs = set(dataset.run for dataset in datasets)
533 if len(runs) != 1:
534 raise ValueError(f"Multiple run names in input datasets: {runs}")
535 run = runs.pop()
536 if run is None:
537 if self.defaults.run is None:
538 raise NoDefaultCollectionError(
539 "No run provided to ingestDatasets, and no default from registry construction."
540 )
541 run = self.defaults.run
543 runRecord = self._managers.collections.find(run)
544 if runRecord.type is not CollectionType.RUN:
545 raise CollectionTypeError(
546 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
547 " RUN collection required."
548 )
549 assert isinstance(runRecord, RunRecord)
551 progress = Progress("lsst.daf.butler.Registry._importDatasets", level=logging.DEBUG)
552 if expand:
553 expandedDatasets = [
554 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
555 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
556 ]
557 else:
558 expandedDatasets = [
559 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
560 for dataset in datasets
561 ]
563 try:
564 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
565 except sqlalchemy.exc.IntegrityError as err:
566 raise ConflictingDefinitionError(
567 f"A database constraint failure was triggered by inserting "
568 f"one or more datasets of type {storage.datasetType} into "
569 f"collection '{run}'. "
570 f"This probably means a dataset with the same data ID "
571 f"and dataset type already exists, but it may also mean a "
572 f"dimension row is missing."
573 ) from err
574 return refs
576 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
577 # Docstring inherited from lsst.daf.butler.registry.Registry
578 return self._managers.datasets.getDatasetRef(id)
580 @transactional
581 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
582 # Docstring inherited from lsst.daf.butler.registry.Registry
583 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
584 for datasetType, refsForType in progress.iter_item_chunks(
585 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
586 ):
587 storage = self._managers.datasets[datasetType.name]
588 try:
589 storage.delete(refsForType)
590 except sqlalchemy.exc.IntegrityError as err:
591 raise OrphanedRecordError(
592 "One or more datasets is still present in one or more Datastores."
593 ) from err
595 @transactional
596 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
597 # Docstring inherited from lsst.daf.butler.registry.Registry
598 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
599 collectionRecord = self._managers.collections.find(collection)
600 if collectionRecord.type is not CollectionType.TAGGED:
601 raise CollectionTypeError(
602 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
603 )
604 for datasetType, refsForType in progress.iter_item_chunks(
605 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
606 ):
607 storage = self._managers.datasets[datasetType.name]
608 try:
609 storage.associate(collectionRecord, refsForType)
610 except sqlalchemy.exc.IntegrityError as err:
611 raise ConflictingDefinitionError(
612 f"Constraint violation while associating dataset of type {datasetType.name} with "
613 f"collection {collection}. This probably means that one or more datasets with the same "
614 f"dataset type and data ID already exist in the collection, but it may also indicate "
615 f"that the datasets do not exist."
616 ) from err
618 @transactional
619 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
620 # Docstring inherited from lsst.daf.butler.registry.Registry
621 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
622 collectionRecord = self._managers.collections.find(collection)
623 if collectionRecord.type is not CollectionType.TAGGED:
624 raise CollectionTypeError(
625 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
626 )
627 for datasetType, refsForType in progress.iter_item_chunks(
628 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
629 ):
630 storage = self._managers.datasets[datasetType.name]
631 storage.disassociate(collectionRecord, refsForType)
633 @transactional
634 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
635 # Docstring inherited from lsst.daf.butler.registry.Registry
636 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
637 collectionRecord = self._managers.collections.find(collection)
638 for datasetType, refsForType in progress.iter_item_chunks(
639 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
640 ):
641 storage = self._managers.datasets[datasetType.name]
642 storage.certify(collectionRecord, refsForType, timespan)
644 @transactional
645 def decertify(
646 self,
647 collection: str,
648 datasetType: Union[str, DatasetType],
649 timespan: Timespan,
650 *,
651 dataIds: Optional[Iterable[DataId]] = None,
652 ) -> None:
653 # Docstring inherited from lsst.daf.butler.registry.Registry
654 collectionRecord = self._managers.collections.find(collection)
655 if isinstance(datasetType, str):
656 storage = self._managers.datasets[datasetType]
657 else:
658 storage = self._managers.datasets[datasetType.name]
659 standardizedDataIds = None
660 if dataIds is not None:
661 standardizedDataIds = [
662 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
663 ]
664 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
666 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
667 """Return an object that allows a new `Datastore` instance to
668 communicate with this `Registry`.
670 Returns
671 -------
672 manager : `DatastoreRegistryBridgeManager`
673 Object that mediates communication between this `Registry` and its
674 associated datastores.
675 """
676 return self._managers.datastores
678 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
679 # Docstring inherited from lsst.daf.butler.registry.Registry
680 return self._managers.datastores.findDatastores(ref)
682 def expandDataId(
683 self,
684 dataId: Optional[DataId] = None,
685 *,
686 graph: Optional[DimensionGraph] = None,
687 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
688 withDefaults: bool = True,
689 **kwargs: Any,
690 ) -> DataCoordinate:
691 # Docstring inherited from lsst.daf.butler.registry.Registry
692 if not withDefaults:
693 defaults = None
694 else:
695 defaults = self.defaults.dataId
696 try:
697 standardized = DataCoordinate.standardize(
698 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
699 )
700 except KeyError as exc:
701 # This means either kwargs have some odd name or required
702 # dimension is missing.
703 raise DimensionNameError(str(exc)) from exc
704 if standardized.hasRecords():
705 return standardized
706 if records is None:
707 records = {}
708 elif isinstance(records, NamedKeyMapping):
709 records = records.byName()
710 else:
711 records = dict(records)
712 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
713 records.update(dataId.records.byName())
714 keys = standardized.byName()
715 for element in standardized.graph.primaryKeyTraversalOrder:
716 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
717 if record is ...:
718 if isinstance(element, Dimension) and keys.get(element.name) is None:
719 if element in standardized.graph.required:
720 raise DimensionNameError(
721 f"No value or null value for required dimension {element.name}."
722 )
723 keys[element.name] = None
724 record = None
725 else:
726 storage = self._managers.dimensions[element]
727 dataIdSet = DataCoordinateIterable.fromScalar(
728 DataCoordinate.standardize(keys, graph=element.graph)
729 )
730 fetched = tuple(storage.fetch(dataIdSet))
731 try:
732 (record,) = fetched
733 except ValueError:
734 record = None
735 records[element.name] = record
736 if record is not None:
737 for d in element.implied:
738 value = getattr(record, d.name)
739 if keys.setdefault(d.name, value) != value:
740 raise InconsistentDataIdError(
741 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
742 f"but {element.name} implies {d.name}={value!r}."
743 )
744 else:
745 if element in standardized.graph.required:
746 raise DataIdValueError(
747 f"Could not fetch record for required dimension {element.name} via keys {keys}."
748 )
749 if element.alwaysJoin:
750 raise InconsistentDataIdError(
751 f"Could not fetch record for element {element.name} via keys {keys}, ",
752 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
753 "related.",
754 )
755 for d in element.implied:
756 keys.setdefault(d.name, None)
757 records.setdefault(d.name, None)
758 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
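    # Illustrative sketch, assuming ``registry`` is a constructed `SqlRegistry`:
    # expanding a minimal data ID so that dimension records and implied dimension
    # values are attached. The instrument and detector values are hypothetical.
    #
    #   >>> dataId = registry.expandDataId(instrument="HSC", detector=50)
    #   >>> dataId.hasRecords()
    #   True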
760 def insertDimensionData(
761 self,
762 element: Union[DimensionElement, str],
763 *data: Union[Mapping[str, Any], DimensionRecord],
764 conform: bool = True,
765 replace: bool = False,
766 ) -> None:
767 # Docstring inherited from lsst.daf.butler.registry.Registry
768 if conform:
769 if isinstance(element, str):
770 element = self.dimensions[element]
771 records = [
772 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
773 ]
774 else:
775 # Ignore typing since caller said to trust them with conform=False.
776 records = data # type: ignore
777 storage = self._managers.dimensions[element] # type: ignore
778 storage.insert(*records, replace=replace)
780 def syncDimensionData(
781 self,
782 element: Union[DimensionElement, str],
783 row: Union[Mapping[str, Any], DimensionRecord],
784 conform: bool = True,
785 update: bool = False,
786 ) -> Union[bool, Dict[str, Any]]:
787 # Docstring inherited from lsst.daf.butler.registry.Registry
788 if conform:
789 if isinstance(element, str):
790 element = self.dimensions[element]
791 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
792 else:
793 # Ignore typing since caller said to trust them with conform=False.
794 record = row # type: ignore
795 storage = self._managers.dimensions[element] # type: ignore
796 return storage.sync(record, update=update)
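    # Illustrative sketch: inserting and then idempotently syncing a dimension
    # row. Field names assume the default dimension configuration; the values
    # are hypothetical.
    #
    #   >>> registry.insertDimensionData("instrument", {"name": "DummyCam", "detector_max": 4})
    #   >>> registry.syncDimensionData("instrument", {"name": "DummyCam", "detector_max": 4})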
798 def queryDatasetTypes(
799 self,
800 expression: Any = ...,
801 *,
802 components: Optional[bool] = None,
803 missing: Optional[List[str]] = None,
804 ) -> Iterator[DatasetType]:
805 # Docstring inherited from lsst.daf.butler.registry.Registry
806 try:
807 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
808 except TypeError as exc:
809 raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
810 unknownComponentsMessage = (
811 "Could not find definition for storage class %s for dataset type %r;"
812 " if it has components they will not be included in dataset type query results."
813 )
814 if wildcard is Ellipsis:
815 for datasetType in self._managers.datasets:
816 # The dataset type can no longer be a component
817 yield datasetType
818 if components:
819 # Automatically create the component dataset types
820 try:
821 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
822 except KeyError as err:
823 _LOG.warning(unknownComponentsMessage, err, datasetType.name)
824 else:
825 yield from componentsForDatasetType
826 return
827 done: Set[str] = set()
828 for name in wildcard.strings:
829 storage = self._managers.datasets.find(name)
830 done.add(name)
831 if storage is None:
832 if missing is not None:
833 missing.append(name)
834 else:
835 yield storage.datasetType
836 if wildcard.patterns:
837 # If components (the argument) is None, we'll save component
838 # dataset types that we might want to match, but only if their
839 # parents didn't get included.
840 componentsForLater = []
841 for registeredDatasetType in self._managers.datasets:
842 # Components are not stored in registry so expand them here
843 allDatasetTypes = [registeredDatasetType]
844 if components is not False:
845 # Only check for the components if we are being asked
846 # for components or components is None.
847 try:
848 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
849 except KeyError as err:
850 _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
851 for datasetType in allDatasetTypes:
852 if datasetType.name in done:
853 continue
854 parentName, componentName = datasetType.nameAndComponent()
855 if componentName is not None and not components:
856 if components is None and parentName not in done:
857 componentsForLater.append(datasetType)
858 continue
859 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
860 done.add(datasetType.name)
861 yield datasetType
862 # Go back and try to match saved components.
863 for datasetType in componentsForLater:
864 parentName, _ = datasetType.nameAndComponent()
865 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
866 yield datasetType
868 def queryCollections(
869 self,
870 expression: Any = ...,
871 datasetType: Optional[DatasetType] = None,
872 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
873 flattenChains: bool = False,
874 includeChains: Optional[bool] = None,
875 ) -> Iterator[str]:
876 # Docstring inherited from lsst.daf.butler.registry.Registry
878 # Right now the datasetType argument is completely ignored, but that
879 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
880 # ticket will take care of that.
881 try:
882 query = CollectionQuery.fromExpression(expression)
883 except TypeError as exc:
884 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
885 collectionTypes = ensure_iterable(collectionTypes)
886 for record in query.iter(
887 self._managers.collections,
888 collectionTypes=frozenset(collectionTypes),
889 flattenChains=flattenChains,
890 includeChains=includeChains,
891 ):
892 yield record.name
894 def _makeQueryBuilder(
895 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
896 ) -> queries.QueryBuilder:
897 """Return a `QueryBuilder` instance capable of constructing and
898 managing more complex queries than those obtainable via `Registry`
899 interfaces.
901 This is an advanced interface; downstream code should prefer
902 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
903 are sufficient.
905 Parameters
906 ----------
907 summary : `queries.QuerySummary`
908 Object describing and categorizing the full set of dimensions that
909 will be included in the query.
910 doomed_by : `Iterable` of `str`, optional
911 A list of diagnostic messages that indicate why the query is going
912 to yield no results and should not even be executed. If an empty
913 container (default) the query will be executed unless other code
914 determines that it is doomed.
916 Returns
917 -------
918 builder : `queries.QueryBuilder`
919 Object that can be used to construct and perform advanced queries.
920 """
921 return queries.QueryBuilder(
922 summary,
923 queries.RegistryManagers(
924 collections=self._managers.collections,
925 dimensions=self._managers.dimensions,
926 datasets=self._managers.datasets,
927 TimespanReprClass=self._db.getTimespanRepresentation(),
928 ),
929 doomed_by=doomed_by,
930 )
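    # Illustrative sketch of this advanced interface; most callers should use
    # queryDataIds or queryDatasets instead. This assumes QuerySummary accepts a
    # bare dimension graph for ``requested`` with other arguments defaulted.
    #
    #   >>> summary = queries.QuerySummary(requested=registry.dimensions.extract(["detector"]))
    #   >>> builder = registry._makeQueryBuilder(summary)
    #   >>> query = builder.finish()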
932 def queryDatasets(
933 self,
934 datasetType: Any,
935 *,
936 collections: Any = None,
937 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
938 dataId: Optional[DataId] = None,
939 where: Optional[str] = None,
940 findFirst: bool = False,
941 components: Optional[bool] = None,
942 bind: Optional[Mapping[str, Any]] = None,
943 check: bool = True,
944 **kwargs: Any,
945 ) -> queries.DatasetQueryResults:
946 # Docstring inherited from lsst.daf.butler.registry.Registry
948 # Standardize the collections expression.
949 if collections is None:
950 if not self.defaults.collections:
951 raise NoDefaultCollectionError(
952 "No collections provided to findDataset, and no defaults from registry construction."
953 )
954 collections = self.defaults.collections
955 elif findFirst:
956 collections = CollectionSearch.fromExpression(collections)
957 else:
958 collections = CollectionQuery.fromExpression(collections)
959 # Standardize and expand the data ID provided as a constraint.
960 standardizedDataId = self.expandDataId(dataId, **kwargs)
962 # We can only query directly if given a non-component DatasetType
963 # instance. If we were given an expression or str or a component
964 # DatasetType instance, we'll populate this dict, recurse, and return.
965 # If we already have a non-component DatasetType, it will remain None
966 # and we'll run the query directly.
967 composition: Optional[
968 Dict[
969 DatasetType, List[Optional[str]]  # parent dataset type -> component names, or None for the parent
970 ]
971 ] = None
972 if not isinstance(datasetType, DatasetType):
973 # We were given a dataset type expression (which may be as simple
974 # as a str). Loop over all matching datasets, delegating handling
975 # of the `components` argument to queryDatasetTypes, as we populate
976 # the composition dict.
977 composition = defaultdict(list)
978 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
979 parentName, componentName = trueDatasetType.nameAndComponent()
980 if componentName is not None:
981 parentDatasetType = self.getDatasetType(parentName)
982 composition.setdefault(parentDatasetType, []).append(componentName)
983 else:
984 composition.setdefault(trueDatasetType, []).append(None)
985 if not composition:
986 return queries.ChainedDatasetQueryResults(
987 [],
988 doomed_by=[
989 f"No registered dataset type matching {t!r} found, so no matching datasets can "
990 "exist in any collection."
991 for t in ensure_iterable(datasetType)
992 ],
993 )
994 elif datasetType.isComponent():
995 # We were given a true DatasetType instance, but it's a component.
996 # The composition dict will have exactly one item.
997 parentName, componentName = datasetType.nameAndComponent()
998 parentDatasetType = self.getDatasetType(parentName)
999 composition = {parentDatasetType: [componentName]}
1000 if composition is not None:
1001 # We need to recurse. Do that once for each parent dataset type.
1002 chain = []
1003 for parentDatasetType, componentNames in composition.items():
1004 parentResults = self.queryDatasets(
1005 parentDatasetType,
1006 collections=collections,
1007 dimensions=dimensions,
1008 dataId=standardizedDataId,
1009 where=where,
1010 bind=bind,
1011 findFirst=findFirst,
1012 check=check,
1013 )
1014 assert isinstance(
1015 parentResults, queries.ParentDatasetQueryResults
1016 ), "Should always be true if passing in a DatasetType instance, and we are."
1017 chain.append(parentResults.withComponents(componentNames))
1018 return queries.ChainedDatasetQueryResults(chain)
1019 # If we get here, there's no need to recurse (or we are already
1020 # recursing; there can only ever be one level of recursion).
1022 # The full set of dimensions in the query is the combination of those
1023 # needed for the DatasetType and those explicitly requested, if any.
1024 requestedDimensionNames = set(datasetType.dimensions.names)
1025 if dimensions is not None:
1026 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1027 # Construct the summary structure needed to construct a QueryBuilder.
1028 summary = queries.QuerySummary(
1029 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1030 dataId=standardizedDataId,
1031 expression=where,
1032 bind=bind,
1033 defaults=self.defaults.dataId,
1034 check=check,
1035 datasets=[datasetType],
1036 )
1037 builder = self._makeQueryBuilder(summary)
1038 # Add the dataset subquery to the query, telling the QueryBuilder to
1039 # include the rank of the selected collection in the results only if we
1040 # need to findFirst. Note that if any of the collections are
1041 # actually wildcard expressions, and we've asked for deduplication,
1042 # this will raise TypeError for us.
1043 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
1044 query = builder.finish()
1045 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
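    # Illustrative sketch, assuming ``registry`` is a constructed `SqlRegistry`:
    # finding the first matching dataset for each data ID in a search path. The
    # dataset type and collection names are hypothetical.
    #
    #   >>> refs = registry.queryDatasets(
    #   ...     "calexp",
    #   ...     collections=["HSC/runs/RC2"],
    #   ...     where="instrument = 'HSC' AND detector = 50",
    #   ...     findFirst=True,
    #   ... )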
1047 def queryDataIds(
1048 self,
1049 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1050 *,
1051 dataId: Optional[DataId] = None,
1052 datasets: Any = None,
1053 collections: Any = None,
1054 where: Optional[str] = None,
1055 components: Optional[bool] = None,
1056 bind: Optional[Mapping[str, Any]] = None,
1057 check: bool = True,
1058 **kwargs: Any,
1059 ) -> queries.DataCoordinateQueryResults:
1060 # Docstring inherited from lsst.daf.butler.registry.Registry
1061 dimensions = ensure_iterable(dimensions)
1062 standardizedDataId = self.expandDataId(dataId, **kwargs)
1063 standardizedDatasetTypes = set()
1064 requestedDimensions = self.dimensions.extract(dimensions)
1065 missing: List[str] = []
1066 if datasets is not None:
1067 if not collections:
1068 if not self.defaults.collections:
1069 raise NoDefaultCollectionError(
1070 f"Cannot pass 'datasets' (='{datasets}') without 'collections'."
1071 )
1072 collections = self.defaults.collections
1073 else:
1074 # Preprocess collections expression in case the original
1075 # included single-pass iterators (we'll want to use it multiple
1076 # times below).
1077 collections = CollectionQuery.fromExpression(collections)
1078 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
1079 # If any matched dataset type is a component, just operate on
1080 # its parent instead, because Registry doesn't know anything
1081 # about what components exist, and here (unlike queryDatasets)
1082 # we don't care about returning them.
1083 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1084 if componentName is not None:
1085 datasetType = self.getDatasetType(parentDatasetTypeName)
1086 standardizedDatasetTypes.add(datasetType)
1087 elif collections:
1088 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1090 def query_factory(
1091 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1092 ) -> Query:
1093 """Construct the Query object that generates query results."""
1094 summary = queries.QuerySummary(
1095 requested=requestedDimensions,
1096 dataId=standardizedDataId,
1097 expression=where,
1098 bind=bind,
1099 defaults=self.defaults.dataId,
1100 check=check,
1101 datasets=standardizedDatasetTypes,
1102 order_by=order_by,
1103 limit=limit,
1104 )
1105 builder = self._makeQueryBuilder(
1106 summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
1107 )
1108 for datasetType in standardizedDatasetTypes:
1109 builder.joinDataset(
1110 datasetType,
1111 collections,
1112 isResult=False,
1113 )
1114 return builder.finish()
1116 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
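    # Illustrative sketch: querying data IDs constrained by the existence of a
    # dataset. The dataset type and collection names are hypothetical.
    #
    #   >>> dataIds = registry.queryDataIds(
    #   ...     ["exposure", "detector"],
    #   ...     datasets="raw",
    #   ...     collections="HSC/raw/all",
    #   ... )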
1118 def queryDimensionRecords(
1119 self,
1120 element: Union[DimensionElement, str],
1121 *,
1122 dataId: Optional[DataId] = None,
1123 datasets: Any = None,
1124 collections: Any = None,
1125 where: Optional[str] = None,
1126 components: Optional[bool] = None,
1127 bind: Optional[Mapping[str, Any]] = None,
1128 check: bool = True,
1129 **kwargs: Any,
1130 ) -> queries.DimensionRecordQueryResults:
1131 # Docstring inherited from lsst.daf.butler.registry.Registry
1132 if not isinstance(element, DimensionElement):
1133 try:
1134 element = self.dimensions[element]
1135 except KeyError as e:
1136 raise DimensionNameError(
1137 f"No such dimension '{element}', available dimensions: "
1138 + str(self.dimensions.getStaticElements())
1139 ) from e
1140 dataIds = self.queryDataIds(
1141 element.graph,
1142 dataId=dataId,
1143 datasets=datasets,
1144 collections=collections,
1145 where=where,
1146 components=components,
1147 bind=bind,
1148 check=check,
1149 **kwargs,
1150 )
1151 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
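    # Illustrative sketch: fetching dimension records rather than data IDs. The
    # where-clause value is hypothetical.
    #
    #   >>> records = registry.queryDimensionRecords("detector", where="instrument = 'HSC'")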
1153 def queryDatasetAssociations(
1154 self,
1155 datasetType: Union[str, DatasetType],
1156 collections: Any = ...,
1157 *,
1158 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1159 flattenChains: bool = False,
1160 ) -> Iterator[DatasetAssociation]:
1161 # Docstring inherited from lsst.daf.butler.registry.Registry
1162 if collections is None:
1163 if not self.defaults.collections:
1164 raise NoDefaultCollectionError(
1165 "No collections provided to findDataset, and no defaults from registry construction."
1166 )
1167 collections = self.defaults.collections
1168 else:
1169 collections = CollectionQuery.fromExpression(collections)
1170 TimespanReprClass = self._db.getTimespanRepresentation()
1171 if isinstance(datasetType, str):
1172 storage = self._managers.datasets[datasetType]
1173 else:
1174 storage = self._managers.datasets[datasetType.name]
1175 for collectionRecord in collections.iter(
1176 self._managers.collections,
1177 collectionTypes=frozenset(collectionTypes),
1178 flattenChains=flattenChains,
1179 ):
1180 query = storage.select(collectionRecord)
1181 for row in self._db.query(query.combine()).mappings():
1182 dataId = DataCoordinate.fromRequiredValues(
1183 storage.datasetType.dimensions,
1184 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1185 )
1186 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1187 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1188 if collectionRecord.type is CollectionType.CALIBRATION:
1189 timespan = TimespanReprClass.extract(row)
1190 else:
1191 timespan = None
1192 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
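    # Illustrative sketch: walking every association of a dataset type across all
    # collections (the dataset type name is hypothetical).
    #
    #   >>> for assoc in registry.queryDatasetAssociations("bias"):
    #   ...     print(assoc.ref, assoc.collection, assoc.timespan)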
1194 storageClasses: StorageClassFactory
1195 """All storage classes known to the registry (`StorageClassFactory`).
1196 """