# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("SqlRegistry",)

import contextlib
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union

import sqlalchemy
from lsst.resources import ResourcePathExpression
from lsst.utils.iteration import ensure_iterable

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
    ddl,
)
from ..core.utils import transactional
from ..registry import (
    ArgumentError,
    CollectionExpressionError,
    CollectionSearch,
    CollectionType,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    DatasetTypeExpressionError,
    DimensionNameError,
    InconsistentDataIdError,
    NoDefaultCollectionError,
    OrphanedRecordError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(
        cls,
        config: Optional[Union[RegistryConfig, str]] = None,
        dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
        butlerRoot: Optional[ResourcePathExpression] = None,
    ) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration, if missing then default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration, if missing then default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)
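
    # A minimal usage sketch (not part of the class): create a new repository
    # backed by an in-memory SQLite database. The ``db`` key and SQLite URI
    # below are illustrative assumptions about the configuration, not values
    # defined in this module.
    #
    #     config = RegistryConfig()
    #     config["db"] = "sqlite:///:memory:"
    #     registry = SqlRegistry.createFromConfig(config)
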
    @classmethod
    def fromConfig(
        cls,
        config: Union[ButlerConfig, RegistryConfig, Config, str],
        butlerRoot: Optional[ResourcePathExpression] = None,
        writeable: bool = True,
        defaults: Optional[RegistryDefaults] = None,
    ) -> Registry:
        """Create `Registry` subclass instance from `config`.

        Registry database must be initialized prior to calling this method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `lsst.resources.ResourcePathExpression`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString),
            origin=config.get("origin", 0),
            namespace=config.get("namespace"),
            writeable=writeable,
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)
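
    # A usage sketch for an existing repository (illustrative; the config path
    # and collection name are assumptions, not values from this module):
    #
    #     defaults = RegistryDefaults(collections=["HSC/defaults"])
    #     registry = SqlRegistry.fromConfig(
    #         "/repo/butler.yaml", writeable=False, defaults=defaults
    #     )
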
    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults
        # In the future DatasetIdFactory may become configurable and this
        # instance will need to be shared with datasets manager.
        self.datasetIdFactory = DatasetIdFactory()

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise
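
    # A usage sketch: group several registry operations into one database
    # transaction so they either all commit or all roll back. The collection
    # name and ``dataIds`` variable are illustrative assumptions.
    #
    #     with registry.transaction(savepoint=True):
    #         registry.registerRun("u/example/run")
    #         registry.insertDatasets("raw", dataIds, run="u/example/run")
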
    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary one has to
        make sure that there are no currently active connections (no session
        or transaction is in progress) and that the connection pool is reset
        using this method. This method should be called by the child process
        immediately after the fork.
        """
        self._db._engine.dispose()
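
    # A usage sketch with fork-based multiprocessing: have every child process
    # reset the inherited SQLAlchemy connection pool right after the fork,
    # before it touches the registry (``process_data_id`` and ``data_ids`` are
    # hypothetical names):
    #
    #     import multiprocessing
    #
    #     with multiprocessing.Pool(
    #         processes=4, initializer=registry.resetConnectionPool
    #     ) as pool:
    #         results = pool.map(process_data_id, data_ids)
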
    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)
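
    # A usage sketch of the opaque-table round trip. The table name, column
    # names, and field specifications are illustrative assumptions, not
    # definitions from this module.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)
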
    def registerCollection(
        self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
    ) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)
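
    # A usage sketch: build a CHAINED collection that searches a RUN and a
    # TAGGED collection in order (the collection names are illustrative
    # assumptions):
    #
    #     registry.registerRun("u/example/run")
    #     registry.registerCollection("u/example/tagged", CollectionType.TAGGED)
    #     registry.registerCollection("u/example/chain", CollectionType.CHAINED)
    #     registry.setCollectionChain("u/example/chain", ["u/example/run", "u/example/tagged"])
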
    def getCollectionParentChains(self, collection: str) -> Set[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return {
            record.name
            for record in self._managers.collections.getParentChains(
                self._managers.collections.find(collection).key
            )
        }

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(
        self,
        datasetType: Union[DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        timespan: Optional[Timespan] = None,
        **kwargs: Any,
    ) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(
            dataId,
            graph=storage.datasetType.dimensions,
            universe=self.dimensions,
            defaults=self.defaults.dataId,
            **kwargs,
        )
        if collections is None:
            if not self.defaults.collections:
                raise NoDefaultCollectionError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if collectionRecord.type is CollectionType.CALIBRATION and (
                not storage.datasetType.isCalibration() or timespan is None
            ):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result
        return None
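
    # A usage sketch: look up a single dataset by dataset type, data ID, and
    # collection search path. The dataset type, dimension values, and
    # collection name are illustrative assumptions.
    #
    #     ref = registry.findDataset(
    #         "calexp",
    #         instrument="HSC",
    #         visit=903334,
    #         detector=42,
    #         collections=["HSC/runs/example"],
    #     )
    #     if ref is None:
    #         print("no matching dataset in the given collections")
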
    @transactional
    def insertDatasets(
        self,
        datasetType: Union[DatasetType, str],
        dataIds: Iterable[DataId],
        run: Optional[str] = None,
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise NoDefaultCollectionError(
                    "No run provided to insertDatasets, and no default from registry construction."
                )
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise CollectionTypeError(
                f"Given collection is of type {runRecord.type.name}; RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [
                self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDataIds = [
                DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
            ]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs
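
    # A usage sketch: register a dataset type and insert new dataset entries
    # into a RUN collection. The dataset type name, dimensions, storage class,
    # and data ID values are illustrative assumptions.
    #
    #     datasetType = DatasetType(
    #         "catalog",
    #         dimensions=["instrument", "visit"],
    #         storageClass="DataFrame",
    #         universe=registry.dimensions,
    #     )
    #     registry.registerDatasetType(datasetType)
    #     refs = registry.insertDatasets(
    #         datasetType,
    #         dataIds=[{"instrument": "HSC", "visit": 903334}],
    #         run="u/example/run",
    #     )
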
    @transactional
    def _importDatasets(
        self,
        datasets: Iterable[DatasetRef],
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise NoDefaultCollectionError(
                    "No run provided to ingestDatasets, and no default from registry construction."
                )
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise CollectionTypeError(
                f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                " RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError(
                    "One or more datasets is still present in one or more Datastores."
                ) from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise CollectionTypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise CollectionTypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(
        self,
        collection: str,
        datasetType: Union[str, DatasetType],
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataId]] = None,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [
                DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
            ]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(
        self,
        dataId: Optional[DataId] = None,
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
        withDefaults: bool = True,
        **kwargs: Any,
    ) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        try:
            standardized = DataCoordinate.standardize(
                dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
            )
        except KeyError as exc:
            # This means either kwargs have some odd name or required
            # dimension is missing.
            raise DimensionNameError(str(exc)) from exc
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise DimensionNameError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise DataIdValueError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
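
    # A usage sketch: expand a minimal data ID into a fully expanded
    # DataCoordinate carrying dimension records. The dimension names, values,
    # and record field are illustrative assumptions drawn from the default
    # dimension universe.
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
    #     print(dataId.records["exposure"].observation_type)
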
    def insertDimensionData(
        self,
        element: Union[DimensionElement, str],
        *data: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        replace: bool = False,
        skip_existing: bool = False,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [
                row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
            ]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace, skip_existing=skip_existing)

    def syncDimensionData(
        self,
        element: Union[DimensionElement, str],
        row: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        update: bool = False,
    ) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)
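
    # A usage sketch: add dimension rows directly, then idempotently sync a
    # single row. The element names and record fields are illustrative
    # assumptions drawn from the default dimension universe.
    #
    #     registry.insertDimensionData(
    #         "instrument", {"name": "HSC", "detector_max": 200, "visit_max": 100000}
    #     )
    #     registry.syncDimensionData(
    #         "detector", {"instrument": "HSC", "id": 42, "full_name": "1_53"}
    #     )
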
    def queryDatasetTypes(
        self,
        expression: Any = ...,
        *,
        components: Optional[bool] = None,
        missing: Optional[List[str]] = None,
    ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        except TypeError as exc:
            raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
        unknownComponentsMessage = (
            "Could not find definition for storage class %s for dataset type %r;"
            " if it has components they will not be included in dataset type query results."
        )
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, datasetType.name)
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            done.add(name)
            if storage is None:
                if missing is not None:
                    missing.append(name)
            else:
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                if components is not False:
                    # Only check for the components if we are being asked
                    # for components or components is None.
                    try:
                        allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(
        self,
        expression: Any = ...,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
        includeChains: Optional[bool] = None,
    ) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        try:
            query = CollectionQuery.fromExpression(expression)
        except TypeError as exc:
            raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
        collectionTypes = ensure_iterable(collectionTypes)
        for record in query.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
            includeChains=includeChains,
        ):
            yield record.name
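
    # A usage sketch: list RUN collections matching a pattern and all dataset
    # types matching another (the regular expressions and names are
    # illustrative assumptions):
    #
    #     import re
    #
    #     runs = list(
    #         registry.queryCollections(
    #             re.compile("u/example/.*"), collectionTypes={CollectionType.RUN}
    #         )
    #     )
    #     metadata_types = list(registry.queryDatasetTypes(re.compile(".*_metadata")))
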
    def _makeQueryBuilder(
        self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
    ) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
            doomed_by=doomed_by,
        )

    def queryDatasets(
        self,
        datasetType: Any,
        *,
        collections: Any = None,
        dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
        dataId: Optional[DataId] = None,
        where: Optional[str] = None,
        findFirst: bool = False,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise NoDefaultCollectionError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]],  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type matching {t!r} found, so no matching datasets can "
                        "exist in any collection."
                        for t in ensure_iterable(datasetType)
                    ],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(
                    parentResults, queries.ParentDatasetQueryResults
                ), "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(parentResults.withComponents(componentNames))
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
            datasets=[datasetType],
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if
        # we need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
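
    # A usage sketch: find the first "calexp" dataset per data ID along a
    # collection search path, constrained by a user expression with a bound
    # parameter. The names, expression, and values are illustrative
    # assumptions.
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/runs/example", "HSC/raw/all"],
    #         where="instrument = 'HSC' AND visit > min_visit",
    #         bind={"min_visit": 903000},
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)
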
    def queryDataIds(
        self,
        dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        missing: List[str] = []
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise NoDefaultCollectionError(
                        f"Cannot pass 'datasets' (='{datasets}') without 'collections'."
                    )
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        def query_factory(
            order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
        ) -> Query:
            """Construct the Query object that generates query results."""
            summary = queries.QuerySummary(
                requested=requestedDimensions,
                dataId=standardizedDataId,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=standardizedDatasetTypes,
                order_by=order_by,
                limit=limit,
            )
            builder = self._makeQueryBuilder(
                summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
            )
            for datasetType in standardizedDatasetTypes:
                builder.joinDataset(
                    datasetType,
                    collections,
                    isResult=False,
                )
            return builder.finish()

        return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
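
    # A usage sketch: find all (exposure, detector) data IDs for which a "raw"
    # dataset exists in a collection. The dataset type, collection, and
    # expression are illustrative assumptions.
    #
    #     dataIds = registry.queryDataIds(
    #         ["exposure", "detector"],
    #         datasets="raw",
    #         collections="HSC/raw/all",
    #         where="instrument = 'HSC'",
    #     )
    #     for dataId in dataIds:
    #         print(dataId["exposure"], dataId["detector"])
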
    def queryDimensionRecords(
        self,
        element: Union[DimensionElement, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DimensionRecordQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise DimensionNameError(
                    f"No such dimension '{element}', available dimensions: "
                    + str(self.dimensions.getStaticElements())
                ) from e
        dataIds = self.queryDataIds(
            element.graph,
            dataId=dataId,
            datasets=datasets,
            collections=collections,
            where=where,
            components=components,
            bind=bind,
            check=check,
            **kwargs,
        )
        return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise NoDefaultCollectionError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
        ):
            query = storage.select(collectionRecord)
            for row in self._db.query(query).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names),
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """