Coverage for python/lsst/daf/butler/registries/sql.py: 13%
469 statements
coverage.py v6.4.4, created at 2022-09-27 08:58 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Dict,
33 Iterable,
34 Iterator,
35 List,
36 Literal,
37 Mapping,
38 Optional,
39 Set,
40 Tuple,
41 Union,
42)
44import sqlalchemy
45from lsst.resources import ResourcePathExpression
46from lsst.utils.iteration import ensure_iterable
48from ..core import (
49 Config,
50 DataCoordinate,
51 DataCoordinateIterable,
52 DataId,
53 DatasetAssociation,
54 DatasetId,
55 DatasetRef,
56 DatasetType,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 NamedKeyMapping,
64 NameLookupMapping,
65 Progress,
66 StorageClassFactory,
67 Timespan,
68 ddl,
69)
70from ..core.utils import transactional
71from ..registry import (
72 ArgumentError,
73 CollectionExpressionError,
74 CollectionSearch,
75 CollectionSummary,
76 CollectionType,
77 CollectionTypeError,
78 ConflictingDefinitionError,
79 DataIdValueError,
80 DatasetTypeError,
81 DatasetTypeExpressionError,
82 DimensionNameError,
83 InconsistentDataIdError,
84 NoDefaultCollectionError,
85 OrphanedRecordError,
86 Registry,
87 RegistryConfig,
88 RegistryDefaults,
89 queries,
90)
91from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
92from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
93from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
95if TYPE_CHECKING:
96 from .._butlerConfig import ButlerConfig
97 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
100_LOG = logging.getLogger(__name__)
103class SqlRegistry(Registry):
104 """Registry implementation based on SQLAlchemy.
106 Parameters
107 ----------
108 database : `Database`
109 Database instance to store Registry.
110 defaults : `RegistryDefaults`
111 Default collection search path and/or output `~CollectionType.RUN`
112 collection.
113 managers : `RegistryManagerInstances`
114 All the managers required for this registry.
115 """
117 defaultConfigFile: Optional[str] = None
118 """Path to configuration defaults. Accessed within the ``configs`` resource
119 or relative to a search path. Can be None if no defaults specified.
120 """
122 @classmethod
123 def createFromConfig(
124 cls,
125 config: Optional[Union[RegistryConfig, str]] = None,
126 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
127 butlerRoot: Optional[ResourcePathExpression] = None,
128 ) -> Registry:
129 """Create registry database and return `SqlRegistry` instance.
131 This method initializes database contents; the database must be empty
132 prior to calling this method.
134 Parameters
135 ----------
136 config : `RegistryConfig` or `str`, optional
137 Registry configuration; if missing, the default configuration will
138 be loaded from registry.yaml.
139 dimensionConfig : `DimensionConfig` or `str`, optional
140 Dimensions configuration; if missing, the default configuration
141 will be loaded from dimensions.yaml.
142 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
143 Path to the repository root this `SqlRegistry` will manage.
145 Returns
146 -------
147 registry : `SqlRegistry`
148 A new `SqlRegistry` instance.
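
        Examples
        --------
        A minimal sketch; the repository root path is illustrative, and with
        no ``config`` or ``dimensionConfig`` supplied the defaults described
        above are used::

            registry = SqlRegistry.createFromConfig(butlerRoot="/path/to/repo")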
149 """
150 config = cls.forceRegistryConfig(config)
151 config.replaceRoot(butlerRoot)
153 if isinstance(dimensionConfig, str):
154 dimensionConfig = DimensionConfig(dimensionConfig)
155 elif dimensionConfig is None:
156 dimensionConfig = DimensionConfig()
157 elif not isinstance(dimensionConfig, DimensionConfig):
158 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
160 DatabaseClass = config.getDatabaseClass()
161 database = DatabaseClass.fromUri(
162 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
163 )
164 managerTypes = RegistryManagerTypes.fromConfig(config)
165 managers = managerTypes.makeRepo(database, dimensionConfig)
166 return cls(database, RegistryDefaults(), managers)
168 @classmethod
169 def fromConfig(
170 cls,
171 config: Union[ButlerConfig, RegistryConfig, Config, str],
172 butlerRoot: Optional[ResourcePathExpression] = None,
173 writeable: bool = True,
174 defaults: Optional[RegistryDefaults] = None,
175 ) -> Registry:
176 """Create `Registry` subclass instance from `config`.
178 Registry database must be initialized prior to calling this method.
180 Parameters
181 ----------
182 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
183 Registry configuration
184 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
185 Path to the repository root this `Registry` will manage.
186 writeable : `bool`, optional
187 If `True` (default) create a read-write connection to the database.
188 defaults : `RegistryDefaults`, optional
189 Default collection search path and/or output `~CollectionType.RUN`
190 collection.
192 Returns
193 -------
194 registry : `SqlRegistry` (subclass)
195 A new `SqlRegistry` subclass instance.
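
        Examples
        --------
        A minimal sketch; the configuration file path is illustrative::

            registry = SqlRegistry.fromConfig(
                "/path/to/repo/butler.yaml", writeable=False
            )
            assert not registry.isWriteable()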
196 """
197 config = cls.forceRegistryConfig(config)
198 config.replaceRoot(butlerRoot)
199 DatabaseClass = config.getDatabaseClass()
200 database = DatabaseClass.fromUri(
201 str(config.connectionString),
202 origin=config.get("origin", 0),
203 namespace=config.get("namespace"),
204 writeable=writeable,
205 )
206 managerTypes = RegistryManagerTypes.fromConfig(config)
207 managers = managerTypes.loadRepo(database)
208 if defaults is None:
209 defaults = RegistryDefaults()
210 return cls(database, defaults, managers)
212 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
213 self._db = database
214 self._managers = managers
215 self.storageClasses = StorageClassFactory()
216 # Intentionally invoke property setter to initialize defaults. This
217 # can only be done after most of the rest of Registry has already been
218 # initialized, and must be done before the property getter is used.
219 self.defaults = defaults
220 # In the future DatasetIdFactory may become configurable and this
221 # instance will need to be shared with datasets manager.
222 self.datasetIdFactory = DatasetIdFactory()
224 def __str__(self) -> str:
225 return str(self._db)
227 def __repr__(self) -> str:
228 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
230 def isWriteable(self) -> bool:
231 # Docstring inherited from lsst.daf.butler.registry.Registry
232 return self._db.isWriteable()
234 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
235 # Docstring inherited from lsst.daf.butler.registry.Registry
236 if defaults is None:
237 # No need to copy, because `RegistryDefaults` is immutable; we
238 # effectively copy on write.
239 defaults = self.defaults
240 return type(self)(self._db, defaults, self._managers)
242 @property
243 def dimensions(self) -> DimensionUniverse:
244 # Docstring inherited from lsst.daf.butler.registry.Registry
245 return self._managers.dimensions.universe
247 def refresh(self) -> None:
248 # Docstring inherited from lsst.daf.butler.registry.Registry
249 self._managers.refresh()
251 @contextlib.contextmanager
252 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
253 # Docstring inherited from lsst.daf.butler.registry.Registry
254 try:
255 with self._db.transaction(savepoint=savepoint):
256 yield
257 except BaseException:
258 # TODO: this clears the caches sometimes when we wouldn't actually
259 # need to. Can we avoid that?
260 self._managers.dimensions.clearCaches()
261 raise
263 def resetConnectionPool(self) -> None:
264 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
266 This operation is useful when using the registry with fork-based
267 multiprocessing. To use the registry across a fork boundary, make
268 sure that there are no currently active connections (no session or
269 transaction is in progress) and that the connection pool is reset
270 using this method. This method should be called by the child process
271 immediately after the fork.
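
        Examples
        --------
        A minimal sketch (``registry`` is assumed to be an existing
        `SqlRegistry` with no active session or transaction)::

            import os

            pid = os.fork()
            if pid == 0:
                # Child process: reset the pool inherited from the parent
                # before issuing any new queries.
                registry.resetConnectionPool()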
272 """
273 self._db._engine.dispose()
275 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
276 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
277 other data repository client.
279 Opaque table records can be added via `insertOpaqueData`, retrieved via
280 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
282 Parameters
283 ----------
284 tableName : `str`
285 Logical name of the opaque table. This may differ from the
286 actual name used in the database by a prefix and/or suffix.
287 spec : `ddl.TableSpec`
288 Specification for the table to be added.
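
        Examples
        --------
        A minimal sketch; the table name, field definitions, and row values
        are illustrative, and the `ddl.TableSpec` arguments shown are only
        one possible way to build a specification::

            spec = ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger,
                                  primaryKey=True),
                    ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
                ]
            )
            registry.registerOpaqueTable("my_datastore_records", spec)
            registry.insertOpaqueData(
                "my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"}
            )
            rows = list(registry.fetchOpaqueData("my_datastore_records",
                                                 dataset_id=1))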
289 """
290 self._managers.opaque.register(tableName, spec)
292 @transactional
293 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
294 """Insert records into an opaque table.
296 Parameters
297 ----------
298 tableName : `str`
299 Logical name of the opaque table. Must match the name used in a
300 previous call to `registerOpaqueTable`.
301 data
302 Each additional positional argument is a dictionary that represents
303 a single row to be added.
304 """
305 self._managers.opaque[tableName].insert(*data)
307 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
308 """Retrieve records from an opaque table.
310 Parameters
311 ----------
312 tableName : `str`
313 Logical name of the opaque table. Must match the name used in a
314 previous call to `registerOpaqueTable`.
315 where
316 Additional keyword arguments are interpreted as equality
317 constraints that restrict the returned rows (combined with AND);
318 keyword arguments are column names and values are the values they
319 must have.
321 Yields
322 ------
323 row : `dict`
324 A dictionary representing a single result row.
325 """
326 yield from self._managers.opaque[tableName].fetch(**where)
328 @transactional
329 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
330 """Remove records from an opaque table.
332 Parameters
333 ----------
334 tableName : `str`
335 Logical name of the opaque table. Must match the name used in a
336 previous call to `registerOpaqueTable`.
337 where
338 Additional keyword arguments are interpreted as equality
339 constraints that restrict the deleted rows (combined with AND);
340 keyword arguments are column names and values are the values they
341 must have.
342 """
343 self._managers.opaque[tableName].delete(where.keys(), where)
345 def registerCollection(
346 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
347 ) -> bool:
348 # Docstring inherited from lsst.daf.butler.registry.Registry
349 _, registered = self._managers.collections.register(name, type, doc=doc)
350 return registered
352 def getCollectionType(self, name: str) -> CollectionType:
353 # Docstring inherited from lsst.daf.butler.registry.Registry
354 return self._managers.collections.find(name).type
356 def _get_collection_record(self, name: str) -> CollectionRecord:
357 # Docstring inherited from lsst.daf.butler.registry.Registry
358 return self._managers.collections.find(name)
360 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
361 # Docstring inherited from lsst.daf.butler.registry.Registry
362 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
363 return registered
365 @transactional
366 def removeCollection(self, name: str) -> None:
367 # Docstring inherited from lsst.daf.butler.registry.Registry
368 self._managers.collections.remove(name)
370 def getCollectionChain(self, parent: str) -> CollectionSearch:
371 # Docstring inherited from lsst.daf.butler.registry.Registry
372 record = self._managers.collections.find(parent)
373 if record.type is not CollectionType.CHAINED:
374 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
375 assert isinstance(record, ChainedCollectionRecord)
376 return record.children
378 @transactional
379 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
380 # Docstring inherited from lsst.daf.butler.registry.Registry
381 record = self._managers.collections.find(parent)
382 if record.type is not CollectionType.CHAINED:
383 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
384 assert isinstance(record, ChainedCollectionRecord)
385 children = CollectionSearch.fromExpression(children)
386 if children != record.children or flatten:
387 record.update(self._managers.collections, children, flatten=flatten)
389 def getCollectionParentChains(self, collection: str) -> Set[str]:
390 # Docstring inherited from lsst.daf.butler.registry.Registry
391 return {
392 record.name
393 for record in self._managers.collections.getParentChains(
394 self._managers.collections.find(collection).key
395 )
396 }
398 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
399 # Docstring inherited from lsst.daf.butler.registry.Registry
400 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
402 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
403 # Docstring inherited from lsst.daf.butler.registry.Registry
404 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
406 def getCollectionSummary(self, collection: str) -> CollectionSummary:
407 # Docstring inherited from lsst.daf.butler.registry.Registry
408 record = self._managers.collections.find(collection)
409 return self._managers.datasets.getCollectionSummary(record)
411 def registerDatasetType(self, datasetType: DatasetType) -> bool:
412 # Docstring inherited from lsst.daf.butler.registry.Registry
413 _, inserted = self._managers.datasets.register(datasetType)
414 return inserted
416 def removeDatasetType(self, name: str) -> None:
417 # Docstring inherited from lsst.daf.butler.registry.Registry
418 self._managers.datasets.remove(name)
420 def getDatasetType(self, name: str) -> DatasetType:
421 # Docstring inherited from lsst.daf.butler.registry.Registry
422 return self._managers.datasets[name].datasetType
424 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
425 # Docstring inherited from lsst.daf.butler.registry.Registry
426 return self._managers.datasets.supportsIdGenerationMode(mode)
428 def findDataset(
429 self,
430 datasetType: Union[DatasetType, str],
431 dataId: Optional[DataId] = None,
432 *,
433 collections: Any = None,
434 timespan: Optional[Timespan] = None,
435 **kwargs: Any,
436 ) -> Optional[DatasetRef]:
437 # Docstring inherited from lsst.daf.butler.registry.Registry
438 if isinstance(datasetType, DatasetType):
439 storage = self._managers.datasets[datasetType.name]
440 else:
441 storage = self._managers.datasets[datasetType]
442 dataId = DataCoordinate.standardize(
443 dataId,
444 graph=storage.datasetType.dimensions,
445 universe=self.dimensions,
446 defaults=self.defaults.dataId,
447 **kwargs,
448 )
449 if collections is None:
450 if not self.defaults.collections:
451 raise NoDefaultCollectionError(
452 "No collections provided to findDataset, and no defaults from registry construction."
453 )
454 collections = self.defaults.collections
455 else:
456 collections = CollectionSearch.fromExpression(collections)
457 for collectionRecord in collections.iter(self._managers.collections):
458 if collectionRecord.type is CollectionType.CALIBRATION and (
459 not storage.datasetType.isCalibration() or timespan is None
460 ):
461 continue
462 result = storage.find(collectionRecord, dataId, timespan=timespan)
463 if result is not None:
464 return result
466 return None
468 @transactional
469 def insertDatasets(
470 self,
471 datasetType: Union[DatasetType, str],
472 dataIds: Iterable[DataId],
473 run: Optional[str] = None,
474 expand: bool = True,
475 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
476 ) -> List[DatasetRef]:
477 # Docstring inherited from lsst.daf.butler.registry.Registry
478 if isinstance(datasetType, DatasetType):
479 storage = self._managers.datasets.find(datasetType.name)
480 if storage is None:
481 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
482 else:
483 storage = self._managers.datasets.find(datasetType)
484 if storage is None:
485 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
486 if run is None:
487 if self.defaults.run is None:
488 raise NoDefaultCollectionError(
489 "No run provided to insertDatasets, and no default from registry construction."
490 )
491 run = self.defaults.run
492 runRecord = self._managers.collections.find(run)
493 if runRecord.type is not CollectionType.RUN:
494 raise CollectionTypeError(
495 f"Given collection is of type {runRecord.type.name}; RUN collection required."
496 )
497 assert isinstance(runRecord, RunRecord)
498 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
499 if expand:
500 expandedDataIds = [
501 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
502 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
503 ]
504 else:
505 expandedDataIds = [
506 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
507 ]
508 try:
509 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
510 except sqlalchemy.exc.IntegrityError as err:
511 raise ConflictingDefinitionError(
512 f"A database constraint failure was triggered by inserting "
513 f"one or more datasets of type {storage.datasetType} into "
514 f"collection '{run}'. "
515 f"This probably means a dataset with the same data ID "
516 f"and dataset type already exists, but it may also mean a "
517 f"dimension row is missing."
518 ) from err
519 return refs
521 @transactional
522 def _importDatasets(
523 self,
524 datasets: Iterable[DatasetRef],
525 expand: bool = True,
526 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
527 reuseIds: bool = False,
528 ) -> List[DatasetRef]:
529 # Docstring inherited from lsst.daf.butler.registry.Registry
530 datasets = list(datasets)
531 if not datasets:
532 # nothing to do
533 return []
535 # find dataset type
536 datasetTypes = set(dataset.datasetType for dataset in datasets)
537 if len(datasetTypes) != 1:
538 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
539 datasetType = datasetTypes.pop()
541 # get storage handler for this dataset type
542 storage = self._managers.datasets.find(datasetType.name)
543 if storage is None:
544 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
546 # find run name
547 runs = set(dataset.run for dataset in datasets)
548 if len(runs) != 1:
549 raise ValueError(f"Multiple run names in input datasets: {runs}")
550 run = runs.pop()
551 if run is None:
552 if self.defaults.run is None:
553 raise NoDefaultCollectionError(
554 "No run provided to ingestDatasets, and no default from registry construction."
555 )
556 run = self.defaults.run
558 runRecord = self._managers.collections.find(run)
559 if runRecord.type is not CollectionType.RUN:
560 raise CollectionTypeError(
561 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
562 " RUN collection required."
563 )
564 assert isinstance(runRecord, RunRecord)
566 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
567 if expand:
568 expandedDatasets = [
569 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
570 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
571 ]
572 else:
573 expandedDatasets = [
574 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
575 for dataset in datasets
576 ]
578 try:
579 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
580 except sqlalchemy.exc.IntegrityError as err:
581 raise ConflictingDefinitionError(
582 f"A database constraint failure was triggered by inserting "
583 f"one or more datasets of type {storage.datasetType} into "
584 f"collection '{run}'. "
585 f"This probably means a dataset with the same data ID "
586 f"and dataset type already exists, but it may also mean a "
587 f"dimension row is missing."
588 ) from err
589 return refs
591 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
592 # Docstring inherited from lsst.daf.butler.registry.Registry
593 return self._managers.datasets.getDatasetRef(id)
595 @transactional
596 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
597 # Docstring inherited from lsst.daf.butler.registry.Registry
598 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
599 for datasetType, refsForType in progress.iter_item_chunks(
600 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
601 ):
602 storage = self._managers.datasets[datasetType.name]
603 try:
604 storage.delete(refsForType)
605 except sqlalchemy.exc.IntegrityError as err:
606 raise OrphanedRecordError(
607 "One or more datasets is still present in one or more Datastores."
608 ) from err
610 @transactional
611 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
612 # Docstring inherited from lsst.daf.butler.registry.Registry
613 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
614 collectionRecord = self._managers.collections.find(collection)
615 if collectionRecord.type is not CollectionType.TAGGED:
616 raise CollectionTypeError(
617 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
618 )
619 for datasetType, refsForType in progress.iter_item_chunks(
620 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
621 ):
622 storage = self._managers.datasets[datasetType.name]
623 try:
624 storage.associate(collectionRecord, refsForType)
625 except sqlalchemy.exc.IntegrityError as err:
626 raise ConflictingDefinitionError(
627 f"Constraint violation while associating dataset of type {datasetType.name} with "
628 f"collection {collection}. This probably means that one or more datasets with the same "
629 f"dataset type and data ID already exist in the collection, but it may also indicate "
630 f"that the datasets do not exist."
631 ) from err
633 @transactional
634 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
635 # Docstring inherited from lsst.daf.butler.registry.Registry
636 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
637 collectionRecord = self._managers.collections.find(collection)
638 if collectionRecord.type is not CollectionType.TAGGED:
639 raise CollectionTypeError(
640 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
641 )
642 for datasetType, refsForType in progress.iter_item_chunks(
643 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
644 ):
645 storage = self._managers.datasets[datasetType.name]
646 storage.disassociate(collectionRecord, refsForType)
648 @transactional
649 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
650 # Docstring inherited from lsst.daf.butler.registry.Registry
651 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
652 collectionRecord = self._managers.collections.find(collection)
653 for datasetType, refsForType in progress.iter_item_chunks(
654 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
655 ):
656 storage = self._managers.datasets[datasetType.name]
657 storage.certify(collectionRecord, refsForType, timespan)
659 @transactional
660 def decertify(
661 self,
662 collection: str,
663 datasetType: Union[str, DatasetType],
664 timespan: Timespan,
665 *,
666 dataIds: Optional[Iterable[DataId]] = None,
667 ) -> None:
668 # Docstring inherited from lsst.daf.butler.registry.Registry
669 collectionRecord = self._managers.collections.find(collection)
670 if isinstance(datasetType, str):
671 storage = self._managers.datasets[datasetType]
672 else:
673 storage = self._managers.datasets[datasetType.name]
674 standardizedDataIds = None
675 if dataIds is not None:
676 standardizedDataIds = [
677 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
678 ]
679 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
681 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
682 """Return an object that allows a new `Datastore` instance to
683 communicate with this `Registry`.
685 Returns
686 -------
687 manager : `DatastoreRegistryBridgeManager`
688 Object that mediates communication between this `Registry` and its
689 associated datastores.
690 """
691 return self._managers.datastores
693 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
694 # Docstring inherited from lsst.daf.butler.registry.Registry
695 return self._managers.datastores.findDatastores(ref)
697 def expandDataId(
698 self,
699 dataId: Optional[DataId] = None,
700 *,
701 graph: Optional[DimensionGraph] = None,
702 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
703 withDefaults: bool = True,
704 **kwargs: Any,
705 ) -> DataCoordinate:
706 # Docstring inherited from lsst.daf.butler.registry.Registry
707 if not withDefaults:
708 defaults = None
709 else:
710 defaults = self.defaults.dataId
711 try:
712 standardized = DataCoordinate.standardize(
713 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
714 )
715 except KeyError as exc:
716 # This means either kwargs have some odd name or a required
717 # dimension is missing.
718 raise DimensionNameError(str(exc)) from exc
719 if standardized.hasRecords():
720 return standardized
721 if records is None:
722 records = {}
723 elif isinstance(records, NamedKeyMapping):
724 records = records.byName()
725 else:
726 records = dict(records)
727 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
728 records.update(dataId.records.byName())
729 keys = standardized.byName()
730 for element in standardized.graph.primaryKeyTraversalOrder:
731 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
732 if record is ...:
733 if isinstance(element, Dimension) and keys.get(element.name) is None:
734 if element in standardized.graph.required:
735 raise DimensionNameError(
736 f"No value or null value for required dimension {element.name}."
737 )
738 keys[element.name] = None
739 record = None
740 else:
741 storage = self._managers.dimensions[element]
742 dataIdSet = DataCoordinateIterable.fromScalar(
743 DataCoordinate.standardize(keys, graph=element.graph)
744 )
745 fetched = tuple(storage.fetch(dataIdSet))
746 try:
747 (record,) = fetched
748 except ValueError:
749 record = None
750 records[element.name] = record
751 if record is not None:
752 for d in element.implied:
753 value = getattr(record, d.name)
754 if keys.setdefault(d.name, value) != value:
755 raise InconsistentDataIdError(
756 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
757 f"but {element.name} implies {d.name}={value!r}."
758 )
759 else:
760 if element in standardized.graph.required:
761 raise DataIdValueError(
762 f"Could not fetch record for required dimension {element.name} via keys {keys}."
763 )
764 if element.alwaysJoin:
765 raise InconsistentDataIdError(
766 f"Could not fetch record for element {element.name} via keys {keys}, ",
767 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
768 "related.",
769 )
770 for d in element.implied:
771 keys.setdefault(d.name, None)
772 records.setdefault(d.name, None)
773 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
775 def insertDimensionData(
776 self,
777 element: Union[DimensionElement, str],
778 *data: Union[Mapping[str, Any], DimensionRecord],
779 conform: bool = True,
780 replace: bool = False,
781 skip_existing: bool = False,
782 ) -> None:
783 # Docstring inherited from lsst.daf.butler.registry.Registry
784 if conform:
785 if isinstance(element, str):
786 element = self.dimensions[element]
787 records = [
788 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
789 ]
790 else:
791 # Ignore typing since caller said to trust them with conform=False.
792 records = data # type: ignore
793 storage = self._managers.dimensions[element] # type: ignore
794 storage.insert(*records, replace=replace, skip_existing=skip_existing)
796 def syncDimensionData(
797 self,
798 element: Union[DimensionElement, str],
799 row: Union[Mapping[str, Any], DimensionRecord],
800 conform: bool = True,
801 update: bool = False,
802 ) -> Union[bool, Dict[str, Any]]:
803 # Docstring inherited from lsst.daf.butler.registry.Registry
804 if conform:
805 if isinstance(element, str):
806 element = self.dimensions[element]
807 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
808 else:
809 # Ignore typing since caller said to trust them with conform=False.
810 record = row # type: ignore
811 storage = self._managers.dimensions[element] # type: ignore
812 return storage.sync(record, update=update)
814 def queryDatasetTypes(
815 self,
816 expression: Any = ...,
817 *,
818 components: Optional[bool] = None,
819 missing: Optional[List[str]] = None,
820 ) -> Iterator[DatasetType]:
821 # Docstring inherited from lsst.daf.butler.registry.Registry
822 try:
823 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
824 except TypeError as exc:
825 raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
826 unknownComponentsMessage = (
827 "Could not find definition for storage class %s for dataset type %r;"
828 " if it has components they will not be included in dataset type query results."
829 )
830 if wildcard is Ellipsis:
831 for datasetType in self._managers.datasets:
832 # The dataset type can no longer be a component
833 yield datasetType
834 if components:
835 # Automatically create the component dataset types
836 try:
837 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
838 except KeyError as err:
839 _LOG.warning(unknownComponentsMessage, err, datasetType.name)
840 else:
841 yield from componentsForDatasetType
842 return
843 done: Set[str] = set()
844 for name in wildcard.strings:
845 storage = self._managers.datasets.find(name)
846 done.add(name)
847 if storage is None:
848 if missing is not None:
849 missing.append(name)
850 else:
851 yield storage.datasetType
852 if wildcard.patterns:
853 # If components (the argument) is None, we'll save component
854 # datasets that we might want to match, but only if their parents
855 # didn't get included.
856 componentsForLater = []
857 for registeredDatasetType in self._managers.datasets:
858 # Components are not stored in registry so expand them here
859 allDatasetTypes = [registeredDatasetType]
860 if components is not False:
861 # Only check for the components if we are being asked
862 # for components or components is None.
863 try:
864 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
865 except KeyError as err:
866 _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
867 for datasetType in allDatasetTypes:
868 if datasetType.name in done:
869 continue
870 parentName, componentName = datasetType.nameAndComponent()
871 if componentName is not None and not components:
872 if components is None and parentName not in done:
873 componentsForLater.append(datasetType)
874 continue
875 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
876 done.add(datasetType.name)
877 yield datasetType
878 # Go back and try to match saved components.
879 for datasetType in componentsForLater:
880 parentName, _ = datasetType.nameAndComponent()
881 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
882 yield datasetType
884 def queryCollections(
885 self,
886 expression: Any = ...,
887 datasetType: Optional[DatasetType] = None,
888 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
889 flattenChains: bool = False,
890 includeChains: Optional[bool] = None,
891 ) -> Iterator[str]:
892 # Docstring inherited from lsst.daf.butler.registry.Registry
894 # Right now the datasetType argument is completely ignored, but that
895 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
896 # ticket will take care of that.
897 try:
898 query = CollectionQuery.fromExpression(expression)
899 except TypeError as exc:
900 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
901 collectionTypes = ensure_iterable(collectionTypes)
902 for record in query.iter(
903 self._managers.collections,
904 collectionTypes=frozenset(collectionTypes),
905 flattenChains=flattenChains,
906 includeChains=includeChains,
907 ):
908 yield record.name
910 def _makeQueryBuilder(
911 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
912 ) -> queries.QueryBuilder:
913 """Return a `QueryBuilder` instance capable of constructing and
914 managing more complex queries than those obtainable via `Registry`
915 interfaces.
917 This is an advanced interface; downstream code should prefer
918 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
919 are sufficient.
921 Parameters
922 ----------
923 summary : `queries.QuerySummary`
924 Object describing and categorizing the full set of dimensions that
925 will be included in the query.
926 doomed_by : `Iterable` of `str`, optional
927 A list of diagnostic messages that indicate why the query is going
928 to yield no results and should not even be executed. If an empty
929 container (default) the query will be executed unless other code
930 determines that it is doomed.
932 Returns
933 -------
934 builder : `queries.QueryBuilder`
935 Object that can be used to construct and perform advanced queries.
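
        Examples
        --------
        A minimal sketch mirroring how `queryDatasets` uses this method; the
        dimension names are illustrative and the remaining `QuerySummary`
        arguments are assumed to be left at their defaults::

            summary = queries.QuerySummary(
                requested=DimensionGraph(self.dimensions, names={"instrument", "visit"})
            )
            builder = self._makeQueryBuilder(summary)
            query = builder.finish()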
936 """
937 return queries.QueryBuilder(
938 summary,
939 backend=queries.SqlQueryBackend(self._db, self._managers),
940 doomed_by=doomed_by,
941 )
943 def _standardize_query_dataset_args(
944 self,
945 datasets: Any,
946 collections: Any,
947 components: bool | None,
948 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
949 *,
950 doomed_by: list[str],
951 ) -> tuple[defaultdict[DatasetType, list[str | None]], CollectionQuery | CollectionSearch | None]:
952 """Preprocess dataset arguments passed to query* methods.
954 Parameters
955 ----------
956 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these
957 Expression identifying dataset types. See `queryDatasetTypes` for
958 details.
959 collections : `str`, `re.Pattern`, or iterable of these
960 Expression identifying collections to be searched. See
961 `queryCollections` for details.
962 components : `bool`, optional
963 If `True`, apply all expression patterns to component dataset type
964 names as well. If `False`, never apply patterns to components.
965 If `None` (default), apply patterns to components only if their
966 parent datasets were not matched by the expression.
967 Fully-specified component datasets (`str` or `DatasetType`
968 instances) are always included.
969 mode : `str`, optional
970 The way in which datasets are being used in this query; one of:
972 - "find_first": this is a query for the first dataset in an
973 ordered list of collections. Prohibits collection wildcards,
974 but permits dataset type wildcards.
976 - "find_all": this is a query for all datasets in all matched
977 collections. Permits collection and dataset type wildcards.
979 - "constrain": this is a query for something other than datasets,
980 with results constrained by dataset existence. Permits
981 collection wildcards and prohibits ``...`` as a dataset type
982 wildcard.
983 doomed_by : `list` [ `str` ]
984 List to append messages indicating why the query is doomed to
985 yield no results.
987 Returns
988 -------
989 composition : `defaultdict` [ `DatasetType`, `list` [ `str` or `None` ] ]
990 Dictionary mapping parent dataset type to `list` of components
991 matched for that dataset type (or `None` for the parent itself).
992 collections : `CollectionSearch` or `CollectionQuery`
993 Processed collection expression.
994 """
995 composition: defaultdict[DatasetType, list[str | None]] = defaultdict(list)
996 if datasets is not None:
997 if not collections:
998 if not self.defaults.collections:
999 raise NoDefaultCollectionError("No collections, and no registry default collections.")
1000 collections = self.defaults.collections
1001 elif mode == "find_first":
1002 collections = CollectionSearch.fromExpression(collections)
1003 else:
1004 collections = CollectionQuery.fromExpression(collections)
1005 missing: list[str] = []
1006 if mode == "constrain" and datasets is Ellipsis:
1007 raise TypeError("Cannot pass the universal wildcard '...' for dataset types in this context.")
1008 for dataset_type in self.queryDatasetTypes(datasets, components=components, missing=missing):
1009 if dataset_type.isComponent():
1010 composition[dataset_type.makeCompositeDatasetType()].append(dataset_type.component())
1011 else:
1012 composition[dataset_type].append(None)
1013 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
1014 elif collections:
1015 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1016 return composition, collections
1018 def queryDatasets(
1019 self,
1020 datasetType: Any,
1021 *,
1022 collections: Any = None,
1023 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1024 dataId: Optional[DataId] = None,
1025 where: Optional[str] = None,
1026 findFirst: bool = False,
1027 components: Optional[bool] = None,
1028 bind: Optional[Mapping[str, Any]] = None,
1029 check: bool = True,
1030 **kwargs: Any,
1031 ) -> queries.DatasetQueryResults:
1032 # Docstring inherited from lsst.daf.butler.registry.Registry
1033 doomed_by: list[str] = []
1034 data_id = self.expandDataId(dataId, **kwargs)
1035 dataset_composition, collections = self._standardize_query_dataset_args(
1036 datasetType,
1037 collections,
1038 components,
1039 mode="find_first" if findFirst else "find_all",
1040 doomed_by=doomed_by,
1041 )
1042 parent_results: list[queries.ParentDatasetQueryResults] = []
1043 for parent_dataset_type, components_for_parent in dataset_composition.items():
1044 # The full set of dimensions in the query is the combination of
1045 # those needed for the DatasetType and those explicitly requested,
1046 # if any.
1047 dimension_names = set(parent_dataset_type.dimensions.names)
1048 if dimensions is not None:
1049 dimension_names.update(self.dimensions.extract(dimensions).names)
1050 # Construct the summary structure needed to construct a
1051 # QueryBuilder.
1052 summary = queries.QuerySummary(
1053 requested=DimensionGraph(self.dimensions, names=dimension_names),
1054 dataId=data_id,
1055 expression=where,
1056 bind=bind,
1057 defaults=self.defaults.dataId,
1058 check=check,
1059 datasets=[parent_dataset_type],
1060 )
1061 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1062 # Add the dataset subquery to the query, telling the QueryBuilder
1063 # to include the rank of the selected collection in the results
1064 # only if we need to findFirst. Note that if any of the
1065 # collections are actually wildcard expressions, and
1066 # findFirst=True, this will raise TypeError for us.
1067 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
1068 query = builder.finish()
1069 parent_results.append(
1070 queries.ParentDatasetQueryResults(
1071 self._db, query, datasetType=parent_dataset_type, components=components_for_parent
1072 )
1073 )
1074 if not parent_results:
1075 doomed_by.extend(
1076 f"No registered dataset type matching {t!r} found, so no matching datasets can "
1077 "exist in any collection."
1078 for t in ensure_iterable(datasetType)
1079 )
1080 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
1081 elif len(parent_results) == 1:
1082 return parent_results[0]
1083 else:
1084 return queries.ChainedDatasetQueryResults(parent_results)
1086 def queryDataIds(
1087 self,
1088 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1089 *,
1090 dataId: Optional[DataId] = None,
1091 datasets: Any = None,
1092 collections: Any = None,
1093 where: Optional[str] = None,
1094 components: Optional[bool] = None,
1095 bind: Optional[Mapping[str, Any]] = None,
1096 check: bool = True,
1097 **kwargs: Any,
1098 ) -> queries.DataCoordinateQueryResults:
1099 # Docstring inherited from lsst.daf.butler.registry.Registry
1100 dimensions = ensure_iterable(dimensions)
1101 requestedDimensions = self.dimensions.extract(dimensions)
1102 doomed_by: list[str] = []
1103 data_id = self.expandDataId(dataId, **kwargs)
1104 dataset_composition, collections = self._standardize_query_dataset_args(
1105 datasets, collections, components, doomed_by=doomed_by
1106 )
1108 def query_factory(
1109 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1110 ) -> queries.Query:
1111 """Construct the Query object that generates query results."""
1112 summary = queries.QuerySummary(
1113 requested=requestedDimensions,
1114 dataId=data_id,
1115 expression=where,
1116 bind=bind,
1117 defaults=self.defaults.dataId,
1118 check=check,
1119 datasets=dataset_composition.keys(),
1120 order_by=order_by,
1121 limit=limit,
1122 )
1123 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
1124 for datasetType in dataset_composition:
1125 builder.joinDataset(datasetType, collections, isResult=False)
1126 return builder.finish()
1128 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
1130 def queryDimensionRecords(
1131 self,
1132 element: Union[DimensionElement, str],
1133 *,
1134 dataId: Optional[DataId] = None,
1135 datasets: Any = None,
1136 collections: Any = None,
1137 where: Optional[str] = None,
1138 components: Optional[bool] = None,
1139 bind: Optional[Mapping[str, Any]] = None,
1140 check: bool = True,
1141 **kwargs: Any,
1142 ) -> queries.DimensionRecordQueryResults:
1143 # Docstring inherited from lsst.daf.butler.registry.Registry
1144 if not isinstance(element, DimensionElement):
1145 try:
1146 element = self.dimensions[element]
1147 except KeyError as e:
1148 raise DimensionNameError(
1149 f"No such dimension '{element}', available dimensions: "
1150 + str(self.dimensions.getStaticElements())
1151 ) from e
1152 dataIds = self.queryDataIds(
1153 element.graph,
1154 dataId=dataId,
1155 datasets=datasets,
1156 collections=collections,
1157 where=where,
1158 components=components,
1159 bind=bind,
1160 check=check,
1161 **kwargs,
1162 )
1163 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1165 def queryDatasetAssociations(
1166 self,
1167 datasetType: Union[str, DatasetType],
1168 collections: Any = ...,
1169 *,
1170 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1171 flattenChains: bool = False,
1172 ) -> Iterator[DatasetAssociation]:
1173 # Docstring inherited from lsst.daf.butler.registry.Registry
1174 if collections is None:
1175 if not self.defaults.collections:
1176 raise NoDefaultCollectionError(
1177 "No collections provided to findDataset, and no defaults from registry construction."
1178 )
1179 collections = self.defaults.collections
1180 else:
1181 collections = CollectionQuery.fromExpression(collections)
1182 TimespanReprClass = self._db.getTimespanRepresentation()
1183 if isinstance(datasetType, str):
1184 storage = self._managers.datasets[datasetType]
1185 else:
1186 storage = self._managers.datasets[datasetType.name]
1187 for collectionRecord in collections.iter(
1188 self._managers.collections,
1189 collectionTypes=frozenset(collectionTypes),
1190 flattenChains=flattenChains,
1191 ):
1192 query = storage.select(collectionRecord)
1193 for row in self._db.query(query).mappings():
1194 dataId = DataCoordinate.fromRequiredValues(
1195 storage.datasetType.dimensions,
1196 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1197 )
1198 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1199 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1200 if collectionRecord.type is CollectionType.CALIBRATION:
1201 timespan = TimespanReprClass.extract(row)
1202 else:
1203 timespan = None
1204 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1206 storageClasses: StorageClassFactory
1207 """All storage classes known to the registry (`StorageClassFactory`).
1208 """