Coverage for python/lsst/daf/butler/registries/sql.py: 13%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("SqlRegistry",)
26import contextlib
27import logging
28from collections import defaultdict
29from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
31import sqlalchemy
32from lsst.resources import ResourcePathExpression
33from lsst.utils.iteration import ensure_iterable
35from ..core import (
36 Config,
37 DataCoordinate,
38 DataCoordinateIterable,
39 DataId,
40 DatasetAssociation,
41 DatasetId,
42 DatasetRef,
43 DatasetType,
44 Dimension,
45 DimensionConfig,
46 DimensionElement,
47 DimensionGraph,
48 DimensionRecord,
49 DimensionUniverse,
50 NamedKeyMapping,
51 NameLookupMapping,
52 Progress,
53 StorageClassFactory,
54 Timespan,
55 ddl,
56)
57from ..core.utils import transactional
58from ..registry import (
59 CollectionSearch,
60 CollectionType,
61 ConflictingDefinitionError,
62 InconsistentDataIdError,
63 OrphanedRecordError,
64 Registry,
65 RegistryConfig,
66 RegistryDefaults,
67 queries,
68)
69from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
70from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
71from ..registry.queries import Query
72from ..registry.summaries import CollectionSummary
73from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
75if TYPE_CHECKING:
76 from .._butlerConfig import ButlerConfig
77 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager
80_LOG = logging.getLogger(__name__)
83class SqlRegistry(Registry):
84 """Registry implementation based on SQLAlchemy.
86 Parameters
87 ----------
88 database : `Database`
89 Database instance to store Registry.
90 defaults : `RegistryDefaults`
91 Default collection search path and/or output `~CollectionType.RUN`
92 collection.
93 managers : `RegistryManagerInstances`
94 All the managers required for this registry.
95 """
97 defaultConfigFile: Optional[str] = None
98 """Path to configuration defaults. Accessed within the ``configs`` resource
99 or relative to a search path. Can be `None` if no defaults are specified.
100 """
102 @classmethod
103 def createFromConfig(
104 cls,
105 config: Optional[Union[RegistryConfig, str]] = None,
106 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
107 butlerRoot: Optional[ResourcePathExpression] = None,
108 ) -> Registry:
109 """Create registry database and return `SqlRegistry` instance.
111 This method initializes the database contents; the database must be
112 empty prior to calling this method.
114 Parameters
115 ----------
116 config : `RegistryConfig` or `str`, optional
117 Registry configuration; if missing, the default configuration is
118 loaded from ``registry.yaml``.
119 dimensionConfig : `DimensionConfig` or `str`, optional
120 Dimension configuration; if missing, the default configuration is
121 loaded from ``dimensions.yaml``.
122 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
123 Path to the repository root this `SqlRegistry` will manage.
125 Returns
126 -------
127 registry : `SqlRegistry`
128 A new `SqlRegistry` instance.
129 """
130 config = cls.forceRegistryConfig(config)
131 config.replaceRoot(butlerRoot)
133 if isinstance(dimensionConfig, str):
134 dimensionConfig = DimensionConfig(dimensionConfig)
135 elif dimensionConfig is None:
136 dimensionConfig = DimensionConfig()
137 elif not isinstance(dimensionConfig, DimensionConfig):
138 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
140 DatabaseClass = config.getDatabaseClass()
141 database = DatabaseClass.fromUri(
142 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
143 )
144 managerTypes = RegistryManagerTypes.fromConfig(config)
145 managers = managerTypes.makeRepo(database, dimensionConfig)
146 return cls(database, RegistryDefaults(), managers)
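    # Illustrative example (not part of the original source): a minimal sketch of
    # creating a brand-new registry backed by a local SQLite file. The repository
    # path and database URL below are hypothetical; an empty `RegistryConfig()`
    # picks up the packaged defaults before the "db" key is overridden.
    #
    #     from lsst.daf.butler.registry import RegistryConfig
    #
    #     config = RegistryConfig()
    #     config["db"] = "sqlite:///some_repo/gen3.sqlite3"  # hypothetical location
    #     registry = SqlRegistry.createFromConfig(config, butlerRoot="some_repo")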
148 @classmethod
149 def fromConfig(
150 cls,
151 config: Union[ButlerConfig, RegistryConfig, Config, str],
152 butlerRoot: Optional[ResourcePathExpression] = None,
153 writeable: bool = True,
154 defaults: Optional[RegistryDefaults] = None,
155 ) -> Registry:
156 """Create `Registry` subclass instance from `config`.
158 Registry database must be initialized prior to calling this method.
160 Parameters
161 ----------
162 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
163 Registry configuration.
164 butlerRoot : `lsst.resources.ResourcePathExpression`, optional
165 Path to the repository root this `Registry` will manage.
166 writeable : `bool`, optional
167 If `True` (default) create a read-write connection to the database.
168 defaults : `RegistryDefaults`, optional
169 Default collection search path and/or output `~CollectionType.RUN`
170 collection.
172 Returns
173 -------
174 registry : `SqlRegistry` (subclass)
175 A new `SqlRegistry` subclass instance.
176 """
177 config = cls.forceRegistryConfig(config)
178 config.replaceRoot(butlerRoot)
179 DatabaseClass = config.getDatabaseClass()
180 database = DatabaseClass.fromUri(
181 str(config.connectionString),
182 origin=config.get("origin", 0),
183 namespace=config.get("namespace"),
184 writeable=writeable,
185 )
186 managerTypes = RegistryManagerTypes.fromConfig(config)
187 managers = managerTypes.loadRepo(database)
188 if defaults is None:
189 defaults = RegistryDefaults()
190 return cls(database, defaults, managers)
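    # Illustrative example (not part of the original source): connecting to an
    # existing repository read-only, with default collections supplied up front.
    # The config path and collection names are hypothetical.
    #
    #     defaults = RegistryDefaults(collections=["refcats", "HSC/raw/all"])
    #     registry = SqlRegistry.fromConfig(
    #         "some_repo/butler.yaml", writeable=False, defaults=defaults
    #     )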
192 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
193 self._db = database
194 self._managers = managers
195 self.storageClasses = StorageClassFactory()
196 # Intentionally invoke property setter to initialize defaults. This
197 # can only be done after most of the rest of Registry has already been
198 # initialized, and must be done before the property getter is used.
199 self.defaults = defaults
201 def __str__(self) -> str:
202 return str(self._db)
204 def __repr__(self) -> str:
205 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
207 def isWriteable(self) -> bool:
208 # Docstring inherited from lsst.daf.butler.registry.Registry
209 return self._db.isWriteable()
211 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
212 # Docstring inherited from lsst.daf.butler.registry.Registry
213 if defaults is None:
214 # No need to copy, because `RegistryDefaults` is immutable; we
215 # effectively copy on write.
216 defaults = self.defaults
217 return type(self)(self._db, defaults, self._managers)
219 @property
220 def dimensions(self) -> DimensionUniverse:
221 # Docstring inherited from lsst.daf.butler.registry.Registry
222 return self._managers.dimensions.universe
224 def refresh(self) -> None:
225 # Docstring inherited from lsst.daf.butler.registry.Registry
226 self._managers.refresh()
228 @contextlib.contextmanager
229 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
230 # Docstring inherited from lsst.daf.butler.registry.Registry
231 try:
232 with self._db.transaction(savepoint=savepoint):
233 yield
234 except BaseException:
235 # TODO: this clears the caches sometimes when we wouldn't actually
236 # need to. Can we avoid that?
237 self._managers.dimensions.clearCaches()
238 raise
240 def resetConnectionPool(self) -> None:
241 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
243 This operation is useful when using the registry with fork-based
244 multiprocessing. To use the registry across a fork boundary, one has
245 to make sure that there are no currently active connections (no
246 session or transaction is in progress) and that the connection pool
247 is reset using this method. This method should be called by the
248 child process immediately after the fork.
249 """
250 self._db._engine.dispose()
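    # Illustrative example (not part of the original source): using the registry
    # with fork-based multiprocessing. The child process resets the pool before
    # issuing any queries; `registry` is assumed to be an already-constructed
    # instance with no open transaction at fork time.
    #
    #     import os
    #
    #     pid = os.fork()
    #     if pid == 0:
    #         registry.resetConnectionPool()  # child must not reuse parent's connections
    #         ...  # child-side registry queries go here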
252 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
253 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
254 other data repository client.
256 Opaque table records can be added via `insertOpaqueData`, retrieved via
257 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
259 Parameters
260 ----------
261 tableName : `str`
262 Logical name of the opaque table. This may differ from the
263 actual name used in the database by a prefix and/or suffix.
264 spec : `ddl.TableSpec`
265 Specification for the table to be added.
266 """
267 self._managers.opaque.register(tableName, spec)
269 @transactional
270 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
271 """Insert records into an opaque table.
273 Parameters
274 ----------
275 tableName : `str`
276 Logical name of the opaque table. Must match the name used in a
277 previous call to `registerOpaqueTable`.
278 data
279 Each additional positional argument is a dictionary that represents
280 a single row to be added.
281 """
282 self._managers.opaque[tableName].insert(*data)
284 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
285 """Retrieve records from an opaque table.
287 Parameters
288 ----------
289 tableName : `str`
290 Logical name of the opaque table. Must match the name used in a
291 previous call to `registerOpaqueTable`.
292 where
293 Additional keyword arguments are interpreted as equality
294 constraints that restrict the returned rows (combined with AND);
295 keyword arguments are column names and values are the values they
296 must have.
298 Yields
299 ------
300 row : `dict`
301 A dictionary representing a single result row.
302 """
303 yield from self._managers.opaque[tableName].fetch(**where)
305 @transactional
306 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
307 """Remove records from an opaque table.
309 Parameters
310 ----------
311 tableName : `str`
312 Logical name of the opaque table. Must match the name used in a
313 previous call to `registerOpaqueTable`.
314 where
315 Additional keyword arguments are interpreted as equality
316 constraints that restrict the deleted rows (combined with AND);
317 keyword arguments are column names and values are the values they
318 must have.
319 """
320 self._managers.opaque[tableName].delete(where.keys(), where)
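    # Illustrative example (not part of the original source): a round trip through
    # an opaque table. The table name, column names, and `ddl.TableSpec` fields
    # shown are hypothetical; consult `ddl.TableSpec` for the real options.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)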
322 def registerCollection(
323 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
324 ) -> bool:
325 # Docstring inherited from lsst.daf.butler.registry.Registry
326 _, registered = self._managers.collections.register(name, type, doc=doc)
327 return registered
329 def getCollectionType(self, name: str) -> CollectionType:
330 # Docstring inherited from lsst.daf.butler.registry.Registry
331 return self._managers.collections.find(name).type
333 def _get_collection_record(self, name: str) -> CollectionRecord:
334 # Docstring inherited from lsst.daf.butler.registry.Registry
335 return self._managers.collections.find(name)
337 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
338 # Docstring inherited from lsst.daf.butler.registry.Registry
339 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
340 return registered
342 @transactional
343 def removeCollection(self, name: str) -> None:
344 # Docstring inherited from lsst.daf.butler.registry.Registry
345 self._managers.collections.remove(name)
347 def getCollectionChain(self, parent: str) -> CollectionSearch:
348 # Docstring inherited from lsst.daf.butler.registry.Registry
349 record = self._managers.collections.find(parent)
350 if record.type is not CollectionType.CHAINED:
351 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
352 assert isinstance(record, ChainedCollectionRecord)
353 return record.children
355 @transactional
356 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
357 # Docstring inherited from lsst.daf.butler.registry.Registry
358 record = self._managers.collections.find(parent)
359 if record.type is not CollectionType.CHAINED:
360 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
361 assert isinstance(record, ChainedCollectionRecord)
362 children = CollectionSearch.fromExpression(children)
363 if children != record.children or flatten:
364 record.update(self._managers.collections, children, flatten=flatten)
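    # Illustrative example (not part of the original source): building a CHAINED
    # collection out of a RUN and a TAGGED collection. All collection names are
    # hypothetical.
    #
    #     registry.registerRun("u/someone/processing-run")
    #     registry.registerCollection("u/someone/tagged", CollectionType.TAGGED)
    #     registry.registerCollection("u/someone/chain", CollectionType.CHAINED)
    #     registry.setCollectionChain(
    #         "u/someone/chain", ["u/someone/processing-run", "u/someone/tagged"]
    #     )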
366 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
367 # Docstring inherited from lsst.daf.butler.registry.Registry
368 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
370 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
371 # Docstring inherited from lsst.daf.butler.registry.Registry
372 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
374 def getCollectionSummary(self, collection: str) -> CollectionSummary:
375 # Docstring inherited from lsst.daf.butler.registry.Registry
376 record = self._managers.collections.find(collection)
377 return self._managers.datasets.getCollectionSummary(record)
379 def registerDatasetType(self, datasetType: DatasetType) -> bool:
380 # Docstring inherited from lsst.daf.butler.registry.Registry
381 _, inserted = self._managers.datasets.register(datasetType)
382 return inserted
384 def removeDatasetType(self, name: str) -> None:
385 # Docstring inherited from lsst.daf.butler.registry.Registry
386 self._managers.datasets.remove(name)
388 def getDatasetType(self, name: str) -> DatasetType:
389 # Docstring inherited from lsst.daf.butler.registry.Registry
390 return self._managers.datasets[name].datasetType
392 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
393 # Docstring inherited from lsst.daf.butler.registry.Registry
394 return self._managers.datasets.supportsIdGenerationMode(mode)
396 def findDataset(
397 self,
398 datasetType: Union[DatasetType, str],
399 dataId: Optional[DataId] = None,
400 *,
401 collections: Any = None,
402 timespan: Optional[Timespan] = None,
403 **kwargs: Any,
404 ) -> Optional[DatasetRef]:
405 # Docstring inherited from lsst.daf.butler.registry.Registry
406 if isinstance(datasetType, DatasetType):
407 storage = self._managers.datasets[datasetType.name]
408 else:
409 storage = self._managers.datasets[datasetType]
410 dataId = DataCoordinate.standardize(
411 dataId,
412 graph=storage.datasetType.dimensions,
413 universe=self.dimensions,
414 defaults=self.defaults.dataId,
415 **kwargs,
416 )
417 if collections is None:
418 if not self.defaults.collections:
419 raise TypeError(
420 "No collections provided to findDataset, and no defaults from registry construction."
421 )
422 collections = self.defaults.collections
423 else:
424 collections = CollectionSearch.fromExpression(collections)
425 for collectionRecord in collections.iter(self._managers.collections):
426 if collectionRecord.type is CollectionType.CALIBRATION and (
427 not storage.datasetType.isCalibration() or timespan is None
428 ):
429 continue
430 result = storage.find(collectionRecord, dataId, timespan=timespan)
431 if result is not None:
432 return result
434 return None
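    # Illustrative example (not part of the original source): looking up a single
    # dataset. The dataset type name, data ID keys, and collection names are
    # hypothetical; CALIBRATION collections additionally require a `timespan`.
    #
    #     ref = registry.findDataset(
    #         "calexp",
    #         instrument="HSC", visit=903334, detector=42,
    #         collections=["HSC/runs/RC2"],
    #     )
    #     if ref is not None:
    #         print(ref.id, ref.run)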
436 @transactional
437 def insertDatasets(
438 self,
439 datasetType: Union[DatasetType, str],
440 dataIds: Iterable[DataId],
441 run: Optional[str] = None,
442 expand: bool = True,
443 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
444 ) -> List[DatasetRef]:
445 # Docstring inherited from lsst.daf.butler.registry.Registry
446 if isinstance(datasetType, DatasetType):
447 storage = self._managers.datasets.find(datasetType.name)
448 if storage is None:
449 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
450 else:
451 storage = self._managers.datasets.find(datasetType)
452 if storage is None:
453 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
454 if run is None:
455 if self.defaults.run is None:
456 raise TypeError(
457 "No run provided to insertDatasets, and no default from registry construction."
458 )
459 run = self.defaults.run
460 runRecord = self._managers.collections.find(run)
461 if runRecord.type is not CollectionType.RUN:
462 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
463 assert isinstance(runRecord, RunRecord)
464 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
465 if expand:
466 expandedDataIds = [
467 self.expandDataId(dataId, graph=storage.datasetType.dimensions)
468 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
469 ]
470 else:
471 expandedDataIds = [
472 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
473 ]
474 try:
475 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
476 except sqlalchemy.exc.IntegrityError as err:
477 raise ConflictingDefinitionError(
478 f"A database constraint failure was triggered by inserting "
479 f"one or more datasets of type {storage.datasetType} into "
480 f"collection '{run}'. "
481 f"This probably means a dataset with the same data ID "
482 f"and dataset type already exists, but it may also mean a "
483 f"dimension row is missing."
484 ) from err
485 return refs
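    # Illustrative example (not part of the original source): registering a dataset
    # type and inserting datasets into a RUN collection. The dataset type name,
    # dimensions, storage class, and data IDs are hypothetical.
    #
    #     datasetType = DatasetType(
    #         "sourceTable", dimensions=["instrument", "visit"],
    #         storageClass="DataFrame", universe=registry.dimensions,
    #     )
    #     registry.registerDatasetType(datasetType)
    #     registry.registerRun("u/someone/run")
    #     refs = registry.insertDatasets(
    #         datasetType,
    #         dataIds=[{"instrument": "HSC", "visit": 903334}],
    #         run="u/someone/run",
    #     )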
487 @transactional
488 def _importDatasets(
489 self,
490 datasets: Iterable[DatasetRef],
491 expand: bool = True,
492 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
493 reuseIds: bool = False,
494 ) -> List[DatasetRef]:
495 # Docstring inherited from lsst.daf.butler.registry.Registry
496 datasets = list(datasets)
497 if not datasets:
498 # nothing to do
499 return []
501 # find dataset type
502 datasetTypes = set(dataset.datasetType for dataset in datasets)
503 if len(datasetTypes) != 1:
504 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
505 datasetType = datasetTypes.pop()
507 # get storage handler for this dataset type
508 storage = self._managers.datasets.find(datasetType.name)
509 if storage is None:
510 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
512 # find run name
513 runs = set(dataset.run for dataset in datasets)
514 if len(runs) != 1:
515 raise ValueError(f"Multiple run names in input datasets: {runs}")
516 run = runs.pop()
517 if run is None:
518 if self.defaults.run is None:
519 raise TypeError(
520 "No run provided to ingestDatasets, and no default from registry construction."
521 )
522 run = self.defaults.run
524 runRecord = self._managers.collections.find(run)
525 if runRecord.type is not CollectionType.RUN:
526 raise TypeError(
527 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
528 " RUN collection required."
529 )
530 assert isinstance(runRecord, RunRecord)
532 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
533 if expand:
534 expandedDatasets = [
535 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
536 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
537 ]
538 else:
539 expandedDatasets = [
540 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
541 for dataset in datasets
542 ]
544 try:
545 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
546 except sqlalchemy.exc.IntegrityError as err:
547 raise ConflictingDefinitionError(
548 f"A database constraint failure was triggered by inserting "
549 f"one or more datasets of type {storage.datasetType} into "
550 f"collection '{run}'. "
551 f"This probably means a dataset with the same data ID "
552 f"and dataset type already exists, but it may also mean a "
553 f"dimension row is missing."
554 ) from err
555 return refs
557 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
558 # Docstring inherited from lsst.daf.butler.registry.Registry
559 return self._managers.datasets.getDatasetRef(id)
561 @transactional
562 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
563 # Docstring inherited from lsst.daf.butler.registry.Registry
564 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
565 for datasetType, refsForType in progress.iter_item_chunks(
566 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
567 ):
568 storage = self._managers.datasets[datasetType.name]
569 try:
570 storage.delete(refsForType)
571 except sqlalchemy.exc.IntegrityError as err:
572 raise OrphanedRecordError(
573 "One or more datasets is still present in one or more Datastores."
574 ) from err
576 @transactional
577 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
578 # Docstring inherited from lsst.daf.butler.registry.Registry
579 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
580 collectionRecord = self._managers.collections.find(collection)
581 if collectionRecord.type is not CollectionType.TAGGED:
582 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
583 for datasetType, refsForType in progress.iter_item_chunks(
584 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
585 ):
586 storage = self._managers.datasets[datasetType.name]
587 try:
588 storage.associate(collectionRecord, refsForType)
589 except sqlalchemy.exc.IntegrityError as err:
590 raise ConflictingDefinitionError(
591 f"Constraint violation while associating dataset of type {datasetType.name} with "
592 f"collection {collection}. This probably means that one or more datasets with the same "
593 f"dataset type and data ID already exist in the collection, but it may also indicate "
594 f"that the datasets do not exist."
595 ) from err
597 @transactional
598 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
599 # Docstring inherited from lsst.daf.butler.registry.Registry
600 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
601 collectionRecord = self._managers.collections.find(collection)
602 if collectionRecord.type is not CollectionType.TAGGED:
603 raise TypeError(
604 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
605 )
606 for datasetType, refsForType in progress.iter_item_chunks(
607 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
608 ):
609 storage = self._managers.datasets[datasetType.name]
610 storage.disassociate(collectionRecord, refsForType)
612 @transactional
613 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
614 # Docstring inherited from lsst.daf.butler.registry.Registry
615 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
616 collectionRecord = self._managers.collections.find(collection)
617 for datasetType, refsForType in progress.iter_item_chunks(
618 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
619 ):
620 storage = self._managers.datasets[datasetType.name]
621 storage.certify(collectionRecord, refsForType, timespan)
623 @transactional
624 def decertify(
625 self,
626 collection: str,
627 datasetType: Union[str, DatasetType],
628 timespan: Timespan,
629 *,
630 dataIds: Optional[Iterable[DataId]] = None,
631 ) -> None:
632 # Docstring inherited from lsst.daf.butler.registry.Registry
633 collectionRecord = self._managers.collections.find(collection)
634 if isinstance(datasetType, str):
635 storage = self._managers.datasets[datasetType]
636 else:
637 storage = self._managers.datasets[datasetType.name]
638 standardizedDataIds = None
639 if dataIds is not None:
640 standardizedDataIds = [
641 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
642 ]
643 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
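    # Illustrative example (not part of the original source): certifying calibration
    # datasets into a CALIBRATION collection for an unbounded validity range and
    # later removing that certification. The collection name and dataset type are
    # hypothetical, and `flat_refs` is assumed to be an iterable of `DatasetRef`
    # for a calibration dataset type.
    #
    #     registry.registerCollection("HSC/calib", CollectionType.CALIBRATION)
    #     registry.certify("HSC/calib", flat_refs, Timespan(begin=None, end=None))
    #     # Remove the certification again for all data IDs of this dataset type.
    #     registry.decertify("HSC/calib", "flat", Timespan(begin=None, end=None))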
645 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
646 """Return an object that allows a new `Datastore` instance to
647 communicate with this `Registry`.
649 Returns
650 -------
651 manager : `DatastoreRegistryBridgeManager`
652 Object that mediates communication between this `Registry` and its
653 associated datastores.
654 """
655 return self._managers.datastores
657 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
658 # Docstring inherited from lsst.daf.butler.registry.Registry
659 return self._managers.datastores.findDatastores(ref)
661 def expandDataId(
662 self,
663 dataId: Optional[DataId] = None,
664 *,
665 graph: Optional[DimensionGraph] = None,
666 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
667 withDefaults: bool = True,
668 **kwargs: Any,
669 ) -> DataCoordinate:
670 # Docstring inherited from lsst.daf.butler.registry.Registry
671 if not withDefaults:
672 defaults = None
673 else:
674 defaults = self.defaults.dataId
675 standardized = DataCoordinate.standardize(
676 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
677 )
678 if standardized.hasRecords():
679 return standardized
680 if records is None:
681 records = {}
682 elif isinstance(records, NamedKeyMapping):
683 records = records.byName()
684 else:
685 records = dict(records)
686 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
687 records.update(dataId.records.byName())
688 keys = standardized.byName()
689 for element in standardized.graph.primaryKeyTraversalOrder:
690 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
691 if record is ...:
692 if isinstance(element, Dimension) and keys.get(element.name) is None:
693 if element in standardized.graph.required:
694 raise LookupError(f"No value or null value for required dimension {element.name}.")
695 keys[element.name] = None
696 record = None
697 else:
698 storage = self._managers.dimensions[element]
699 dataIdSet = DataCoordinateIterable.fromScalar(
700 DataCoordinate.standardize(keys, graph=element.graph)
701 )
702 fetched = tuple(storage.fetch(dataIdSet))
703 try:
704 (record,) = fetched
705 except ValueError:
706 record = None
707 records[element.name] = record
708 if record is not None:
709 for d in element.implied:
710 value = getattr(record, d.name)
711 if keys.setdefault(d.name, value) != value:
712 raise InconsistentDataIdError(
713 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
714 f"but {element.name} implies {d.name}={value!r}."
715 )
716 else:
717 if element in standardized.graph.required:
718 raise LookupError(
719 f"Could not fetch record for required dimension {element.name} via keys {keys}."
720 )
721 if element.alwaysJoin:
722 raise InconsistentDataIdError(
723 f"Could not fetch record for element {element.name} via keys {keys}, ",
724 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
725 "related.",
726 )
727 for d in element.implied:
728 keys.setdefault(d.name, None)
729 records.setdefault(d.name, None)
730 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
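    # Illustrative example (not part of the original source): expanding a minimal
    # data ID so that dimension records become available on the returned
    # `DataCoordinate`. The instrument and visit values are hypothetical.
    #
    #     dataId = registry.expandDataId(instrument="HSC", visit=903334)
    #     visit_record = dataId.records["visit"]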
732 def insertDimensionData(
733 self,
734 element: Union[DimensionElement, str],
735 *data: Union[Mapping[str, Any], DimensionRecord],
736 conform: bool = True,
737 replace: bool = False,
738 ) -> None:
739 # Docstring inherited from lsst.daf.butler.registry.Registry
740 if conform:
741 if isinstance(element, str):
742 element = self.dimensions[element]
743 records = [
744 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
745 ]
746 else:
747 # Ignore typing since caller said to trust them with conform=False.
748 records = data # type: ignore
749 storage = self._managers.dimensions[element] # type: ignore
750 storage.insert(*records, replace=replace)
752 def syncDimensionData(
753 self,
754 element: Union[DimensionElement, str],
755 row: Union[Mapping[str, Any], DimensionRecord],
756 conform: bool = True,
757 update: bool = False,
758 ) -> Union[bool, Dict[str, Any]]:
759 # Docstring inherited from lsst.daf.butler.registry.Registry
760 if conform:
761 if isinstance(element, str):
762 element = self.dimensions[element]
763 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
764 else:
765 # Ignore typing since caller said to trust them with conform=False.
766 record = row # type: ignore
767 storage = self._managers.dimensions[element] # type: ignore
768 return storage.sync(record, update=update)
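    # Illustrative example (not part of the original source): inserting and syncing
    # dimension records from plain dictionaries. The record contents are
    # hypothetical and must match the dimension universe's schema for each element.
    #
    #     registry.insertDimensionData("instrument", {"name": "HSC", "detector_max": 200})
    #     registry.syncDimensionData(
    #         "detector", {"instrument": "HSC", "id": 42, "full_name": "1_53"}, update=True
    #     )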
770 def queryDatasetTypes(
771 self,
772 expression: Any = ...,
773 *,
774 components: Optional[bool] = None,
775 missing: Optional[List[str]] = None,
776 ) -> Iterator[DatasetType]:
777 # Docstring inherited from lsst.daf.butler.registry.Registry
778 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
779 if wildcard is Ellipsis:
780 for datasetType in self._managers.datasets:
781 # The dataset type can no longer be a component
782 yield datasetType
783 if components:
784 # Automatically create the component dataset types
785 try:
786 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
787 except KeyError as err:
788 _LOG.warning(
789 f"Could not load storage class {err} for {datasetType.name}; "
790 "if it has components they will not be included in query results."
791 )
792 else:
793 yield from componentsForDatasetType
794 return
795 done: Set[str] = set()
796 for name in wildcard.strings:
797 storage = self._managers.datasets.find(name)
798 done.add(name)
799 if storage is None:
800 if missing is not None:
801 missing.append(name)
802 else:
803 yield storage.datasetType
804 if wildcard.patterns:
805 # If components (the argument) is None, we'll save component
806 # dataset types that we might want to match, but only if their
807 # parents didn't get included.
808 componentsForLater = []
809 for registeredDatasetType in self._managers.datasets:
810 # Components are not stored in registry so expand them here
811 allDatasetTypes = [registeredDatasetType]
812 try:
813 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
814 except KeyError as err:
815 _LOG.warning(
816 f"Could not load storage class {err} for {registeredDatasetType.name}; "
817 "if it has components they will not be included in query results."
818 )
819 for datasetType in allDatasetTypes:
820 if datasetType.name in done:
821 continue
822 parentName, componentName = datasetType.nameAndComponent()
823 if componentName is not None and not components:
824 if components is None and parentName not in done:
825 componentsForLater.append(datasetType)
826 continue
827 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
828 done.add(datasetType.name)
829 yield datasetType
830 # Go back and try to match saved components.
831 for datasetType in componentsForLater:
832 parentName, _ = datasetType.nameAndComponent()
833 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
834 yield datasetType
836 def queryCollections(
837 self,
838 expression: Any = ...,
839 datasetType: Optional[DatasetType] = None,
840 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
841 flattenChains: bool = False,
842 includeChains: Optional[bool] = None,
843 ) -> Iterator[str]:
844 # Docstring inherited from lsst.daf.butler.registry.Registry
846 # Right now the datasetType argument is completely ignored, but that
847 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
848 # ticket will take care of that.
849 query = CollectionQuery.fromExpression(expression)
850 collectionTypes = ensure_iterable(collectionTypes)
851 for record in query.iter(
852 self._managers.collections,
853 collectionTypes=frozenset(collectionTypes),
854 flattenChains=flattenChains,
855 includeChains=includeChains,
856 ):
857 yield record.name
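    # Illustrative example (not part of the original source): enumerating dataset
    # types and collections with regular-expression wildcards. The patterns and
    # collection names are hypothetical.
    #
    #     import re
    #
    #     for datasetType in registry.queryDatasetTypes(re.compile("deepCoadd.*")):
    #         print(datasetType.name)
    #     for name in registry.queryCollections(
    #         re.compile("HSC/runs/.*"),
    #         collectionTypes={CollectionType.RUN},
    #         flattenChains=True,
    #     ):
    #         print(name)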
859 def _makeQueryBuilder(
860 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
861 ) -> queries.QueryBuilder:
862 """Return a `QueryBuilder` instance capable of constructing and
863 managing more complex queries than those obtainable via `Registry`
864 interfaces.
866 This is an advanced interface; downstream code should prefer
867 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
868 are sufficient.
870 Parameters
871 ----------
872 summary : `queries.QuerySummary`
873 Object describing and categorizing the full set of dimensions that
874 will be included in the query.
875 doomed_by : `Iterable` of `str`, optional
876 A list of diagnostic messages that indicate why the query is going
877 to yield no results and should not even be executed. If an empty
878 container (default) the query will be executed unless other code
879 determines that it is doomed.
881 Returns
882 -------
883 builder : `queries.QueryBuilder`
884 Object that can be used to construct and perform advanced queries.
885 """
886 return queries.QueryBuilder(
887 summary,
888 queries.RegistryManagers(
889 collections=self._managers.collections,
890 dimensions=self._managers.dimensions,
891 datasets=self._managers.datasets,
892 TimespanReprClass=self._db.getTimespanRepresentation(),
893 ),
894 doomed_by=doomed_by,
895 )
897 def queryDatasets(
898 self,
899 datasetType: Any,
900 *,
901 collections: Any = None,
902 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
903 dataId: Optional[DataId] = None,
904 where: Optional[str] = None,
905 findFirst: bool = False,
906 components: Optional[bool] = None,
907 bind: Optional[Mapping[str, Any]] = None,
908 check: bool = True,
909 **kwargs: Any,
910 ) -> queries.DatasetQueryResults:
911 # Docstring inherited from lsst.daf.butler.registry.Registry
913 # Standardize the collections expression.
914 if collections is None:
915 if not self.defaults.collections:
916 raise TypeError(
917 "No collections provided to findDataset, and no defaults from registry construction."
918 )
919 collections = self.defaults.collections
920 elif findFirst:
921 collections = CollectionSearch.fromExpression(collections)
922 else:
923 collections = CollectionQuery.fromExpression(collections)
924 # Standardize and expand the data ID provided as a constraint.
925 standardizedDataId = self.expandDataId(dataId, **kwargs)
927 # We can only query directly if given a non-component DatasetType
928 # instance. If we were given an expression or str or a component
929 # DatasetType instance, we'll populate this dict, recurse, and return.
930 # If we already have a non-component DatasetType, it will remain None
931 # and we'll run the query directly.
932 composition: Optional[
933 Dict[
934 DatasetType, List[Optional[str]]  # parent dataset type -> component names (None for the parent itself)
935 ]
936 ] = None
937 if not isinstance(datasetType, DatasetType):
938 # We were given a dataset type expression (which may be as simple
939 # as a str). Loop over all matching datasets, delegating handling
940 # of the `components` argument to queryDatasetTypes, as we populate
941 # the composition dict.
942 composition = defaultdict(list)
943 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
944 parentName, componentName = trueDatasetType.nameAndComponent()
945 if componentName is not None:
946 parentDatasetType = self.getDatasetType(parentName)
947 composition.setdefault(parentDatasetType, []).append(componentName)
948 else:
949 composition.setdefault(trueDatasetType, []).append(None)
950 if not composition:
951 return queries.ChainedDatasetQueryResults(
952 [],
953 doomed_by=[
954 f"No registered dataset type matching {t!r} found."
955 for t in ensure_iterable(datasetType)
956 ],
957 )
958 elif datasetType.isComponent():
959 # We were given a true DatasetType instance, but it's a component.
960 # The composition dict will have exactly one item.
961 parentName, componentName = datasetType.nameAndComponent()
962 parentDatasetType = self.getDatasetType(parentName)
963 composition = {parentDatasetType: [componentName]}
964 if composition is not None:
965 # We need to recurse. Do that once for each parent dataset type.
966 chain = []
967 for parentDatasetType, componentNames in composition.items():
968 parentResults = self.queryDatasets(
969 parentDatasetType,
970 collections=collections,
971 dimensions=dimensions,
972 dataId=standardizedDataId,
973 where=where,
974 bind=bind,
975 findFirst=findFirst,
976 check=check,
977 )
978 assert isinstance(
979 parentResults, queries.ParentDatasetQueryResults
980 ), "Should always be true if passing in a DatasetType instance, and we are."
981 chain.append(parentResults.withComponents(componentNames))
982 return queries.ChainedDatasetQueryResults(chain)
983 # If we get here, there's no need to recurse (or we are already
984 # recursing; there can only ever be one level of recursion).
986 # The full set of dimensions in the query is the combination of those
987 # needed for the DatasetType and those explicitly requested, if any.
988 requestedDimensionNames = set(datasetType.dimensions.names)
989 if dimensions is not None:
990 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
991 # Construct the summary structure needed to construct a QueryBuilder.
992 summary = queries.QuerySummary(
993 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
994 dataId=standardizedDataId,
995 expression=where,
996 bind=bind,
997 defaults=self.defaults.dataId,
998 check=check,
999 datasets=[datasetType],
1000 )
1001 builder = self._makeQueryBuilder(summary)
1002 # Add the dataset subquery to the query, telling the QueryBuilder to
1003 # include the rank of the selected collection in the results only if we
1004 # need to findFirst. Note that if any of the collections are
1005 # actually wildcard expressions, and we've asked for deduplication,
1006 # this will raise TypeError for us.
1007 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
1008 query = builder.finish()
1009 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
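    # Illustrative example (not part of the original source): querying datasets with
    # a user expression and bind parameters, keeping only the first match per data
    # ID in the collection search order. Names and values are hypothetical.
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/runs/RC2"],
    #         where="instrument = 'HSC' AND visit = my_visit",
    #         bind={"my_visit": 903334},
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)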
1011 def queryDataIds(
1012 self,
1013 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
1014 *,
1015 dataId: Optional[DataId] = None,
1016 datasets: Any = None,
1017 collections: Any = None,
1018 where: Optional[str] = None,
1019 components: Optional[bool] = None,
1020 bind: Optional[Mapping[str, Any]] = None,
1021 check: bool = True,
1022 **kwargs: Any,
1023 ) -> queries.DataCoordinateQueryResults:
1024 # Docstring inherited from lsst.daf.butler.registry.Registry
1025 dimensions = ensure_iterable(dimensions)
1026 standardizedDataId = self.expandDataId(dataId, **kwargs)
1027 standardizedDatasetTypes = set()
1028 requestedDimensions = self.dimensions.extract(dimensions)
1029 missing: List[str] = []
1030 if datasets is not None:
1031 if not collections:
1032 if not self.defaults.collections:
1033 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
1034 collections = self.defaults.collections
1035 else:
1036 # Preprocess collections expression in case the original
1037 # included single-pass iterators (we'll want to use it multiple
1038 # times below).
1039 collections = CollectionQuery.fromExpression(collections)
1040 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
1041 # If any matched dataset type is a component, just operate on
1042 # its parent instead, because Registry doesn't know anything
1043 # about what components exist, and here (unlike queryDatasets)
1044 # we don't care about returning them.
1045 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1046 if componentName is not None:
1047 datasetType = self.getDatasetType(parentDatasetTypeName)
1048 standardizedDatasetTypes.add(datasetType)
1049 elif collections:
1050 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
1052 def query_factory(
1053 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
1054 ) -> Query:
1055 """Construct the Query object that generates query results."""
1056 summary = queries.QuerySummary(
1057 requested=requestedDimensions,
1058 dataId=standardizedDataId,
1059 expression=where,
1060 bind=bind,
1061 defaults=self.defaults.dataId,
1062 check=check,
1063 datasets=standardizedDatasetTypes,
1064 order_by=order_by,
1065 limit=limit,
1066 )
1067 builder = self._makeQueryBuilder(
1068 summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
1069 )
1070 for datasetType in standardizedDatasetTypes:
1071 builder.joinDataset(
1072 datasetType,
1073 collections,
1074 isResult=False,
1075 )
1076 return builder.finish()
1078 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
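    # Illustrative example (not part of the original source): querying data IDs
    # constrained by dataset existence, then expanding the results to include
    # dimension records. Dimension, dataset type, and collection names are
    # hypothetical.
    #
    #     dataIds = registry.queryDataIds(
    #         ["visit", "detector"],
    #         datasets="raw",
    #         collections=["HSC/raw/all"],
    #         instrument="HSC",
    #     )
    #     for dataId in dataIds.expanded():
    #         print(dataId["visit"], dataId["detector"])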
1080 def queryDimensionRecords(
1081 self,
1082 element: Union[DimensionElement, str],
1083 *,
1084 dataId: Optional[DataId] = None,
1085 datasets: Any = None,
1086 collections: Any = None,
1087 where: Optional[str] = None,
1088 components: Optional[bool] = None,
1089 bind: Optional[Mapping[str, Any]] = None,
1090 check: bool = True,
1091 **kwargs: Any,
1092 ) -> queries.DimensionRecordQueryResults:
1093 # Docstring inherited from lsst.daf.butler.registry.Registry
1094 if not isinstance(element, DimensionElement):
1095 try:
1096 element = self.dimensions[element]
1097 except KeyError as e:
1098 raise KeyError(
1099 f"No such dimension '{element}', available dimensions: "
1100 + str(self.dimensions.getStaticElements())
1101 ) from e
1102 dataIds = self.queryDataIds(
1103 element.graph,
1104 dataId=dataId,
1105 datasets=datasets,
1106 collections=collections,
1107 where=where,
1108 components=components,
1109 bind=bind,
1110 check=check,
1111 **kwargs,
1112 )
1113 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1115 def queryDatasetAssociations(
1116 self,
1117 datasetType: Union[str, DatasetType],
1118 collections: Any = ...,
1119 *,
1120 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1121 flattenChains: bool = False,
1122 ) -> Iterator[DatasetAssociation]:
1123 # Docstring inherited from lsst.daf.butler.registry.Registry
1124 if collections is None:
1125 if not self.defaults.collections:
1126 raise TypeError(
1127 "No collections provided to findDataset, and no defaults from registry construction."
1128 )
1129 collections = self.defaults.collections
1130 else:
1131 collections = CollectionQuery.fromExpression(collections)
1132 TimespanReprClass = self._db.getTimespanRepresentation()
1133 if isinstance(datasetType, str):
1134 storage = self._managers.datasets[datasetType]
1135 else:
1136 storage = self._managers.datasets[datasetType.name]
1137 for collectionRecord in collections.iter(
1138 self._managers.collections,
1139 collectionTypes=frozenset(collectionTypes),
1140 flattenChains=flattenChains,
1141 ):
1142 query = storage.select(collectionRecord)
1143 for row in self._db.query(query.combine()).mappings():
1144 dataId = DataCoordinate.fromRequiredValues(
1145 storage.datasetType.dimensions,
1146 tuple(row[name] for name in storage.datasetType.dimensions.required.names),
1147 )
1148 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1149 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
1150 if collectionRecord.type is CollectionType.CALIBRATION:
1151 timespan = TimespanReprClass.extract(row)
1152 else:
1153 timespan = None
1154 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
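    # Illustrative example (not part of the original source): listing where a
    # dataset type appears across CALIBRATION collections, using the default
    # collection expression (...). The dataset type name is hypothetical.
    #
    #     for assoc in registry.queryDatasetAssociations(
    #         "bias", collections=..., collectionTypes={CollectionType.CALIBRATION}
    #     ):
    #         print(assoc.collection, assoc.ref.dataId, assoc.timespan)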
1156 storageClasses: StorageClassFactory
1157 """All storage classes known to the registry (`StorageClassFactory`).
1158 """