Coverage for python/lsst/daf/butler/registries/sql.py : 13%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "SqlRegistry",
)

from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy
from lsst.utils.iteration import ensure_iterable

from ..core import (
    ButlerURI,
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
)
from ..core.utils import transactional

from ..registry import (
    Registry,
    RegistryConfig,
    CollectionType,
    RegistryDefaults,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    CollectionSearch,
)
from ..registry import queries
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
from ..registry.summaries import CollectionSummary
from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import (
        CollectionRecord,
        Database,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs``
    resource or relative to a search path. Can be `None` if no defaults
    are specified.
    """

    @classmethod
    def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
                         dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
                         butlerRoot: Optional[str] = None) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes database contents; the database must be empty
        prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration. If missing, the default configuration will
            be loaded from ``registry.yaml``.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration. If missing, the default configuration
            will be loaded from ``dimensions.yaml``.
        butlerRoot : `str`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
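
        Examples
        --------
        A minimal sketch of bootstrapping a new repository database; the
        SQLite URI and the use of the ``db`` connection-string key are
        illustrative assumptions, not requirements of this method::

            config = RegistryConfig({"db": "sqlite:///example.sqlite3"})
            registry = SqlRegistry.createFromConfig(config)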
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"))
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
                   butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
                   defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create `Registry` subclass instance from ``config``.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `str` or `ButlerURI`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the
            database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
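
        Examples
        --------
        A minimal sketch of opening an existing repository read-only; the
        configuration path is illustrative::

            registry = SqlRegistry.fromConfig("/path/to/butler.yaml",
                                              writeable=False)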
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset the SQLAlchemy connection pool for the `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary, one has
        to make sure that there are no currently active connections (no
        session or transaction is in progress) and that the connection pool
        is reset using this method. This method should be called by the child
        process immediately after the fork.
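
        Examples
        --------
        A minimal sketch of the intended call pattern with fork-based
        `multiprocessing`; ``process_one`` and ``data_ids`` are hypothetical
        stand-ins for the caller's own work function and arguments::

            from multiprocessing import Pool

            def _init_child():
                # Runs once in each child process, right after the fork.
                registry.resetConnectionPool()

            with Pool(4, initializer=_init_child) as pool:
                results = pool.map(process_one, data_ids)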
        """
        self._db._engine.dispose()

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
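
        Examples
        --------
        A minimal sketch of registering a small opaque table; the table and
        field names are illustrative, and the exact `ddl.FieldSpec` arguments
        should be checked against the ``ddl`` module::

            spec = ddl.TableSpec(fields=[
                ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger,
                              primaryKey=True),
                ddl.FieldSpec(name="checksum", dtype=sqlalchemy.String,
                              length=32, nullable=True),
            ])
            registry.registerOpaqueTable("my_datastore_records", spec)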
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
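
        Examples
        --------
        A minimal sketch, reusing the illustrative table from
        `registerOpaqueTable`; each positional argument is one row::

            registry.insertOpaqueData("my_datastore_records",
                                      {"dataset_id": 1, "checksum": "abc"},
                                      {"dataset_id": 2, "checksum": None})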
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
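
        Examples
        --------
        A minimal sketch; the keyword argument is an equality constraint on
        the illustrative ``dataset_id`` column::

            for row in registry.fetchOpaqueData("my_datastore_records",
                                                dataset_id=1):
                print(row["checksum"])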
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
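
        Examples
        --------
        A minimal sketch, removing one of the illustrative rows inserted via
        `insertOpaqueData`::

            registry.deleteOpaqueData("my_datastore_records", dataset_id=1)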
        """
        self._managers.opaque[tableName].delete(where.keys(), where)

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
                           doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any = None, timespan: Optional[Timespan] = None,
                    **kwargs: Any) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, defaults=self.defaults.dataId,
                                            **kwargs)
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to findDataset, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if (collectionRecord.type is CollectionType.CALIBRATION
                    and (not storage.datasetType.isCalibration() or timespan is None)):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: Optional[str] = None, expand: bool = True,
                       idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to insertDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                               for dataId in progress.wrap(dataIds,
                                                           f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
                               for dataId in dataIds]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    @transactional
    def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
                        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                        reuseIds: bool = False) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to ingestDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                            " RUN collection required.")
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Removing datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Associating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Disassociating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Certifying datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataId]] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
                                   for d in dataIds]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     withDefaults: bool = True,
                     **kwargs: Any) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
                                                  defaults=defaults, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
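        # Walk the dimension elements in an order that guarantees that the
        # key values a record lookup needs are already in `keys` by the time
        # we reach it: reuse caller-supplied records where we have them,
        # fetch the rest from dimension storage, and use fetched records to
        # back-fill implied dimension values, checking for inconsistencies.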
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True,
                            replace: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True,
                          update: bool = False) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
                          ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
                                     "if it has components they will not be included in query results.")
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            if storage is not None:
                done.add(storage.datasetType.name)
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
                                 "if it has components they will not be included in query results.")
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetTypes argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
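
        Examples
        --------
        A minimal sketch, modeled on how `queryDataIds` uses the builder in
        this module; the dimension names and ``where`` expression are
        illustrative, and `queries.QuerySummary` may need additional
        arguments in practice::

            summary = queries.QuerySummary(
                requested=DimensionGraph(self.dimensions,
                                         names={"instrument", "visit"}),
                expression="instrument = 'HSC'",
            )
            builder = self._makeQueryBuilder(summary)
            query = builder.finish()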
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
        )

    def queryDatasets(self, datasetType: Any, *,
                      collections: Any = None,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      findFirst: bool = False,
                      components: Optional[bool] = None,
                      bind: Optional[Mapping[str, Any]] = None,
                      check: bool = True,
                      **kwargs: Any) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasets, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]]  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[f"No registered dataset type matching {t!r} found."
                               for t in ensure_iterable(datasetType)],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(parentResults, queries.ParentDatasetQueryResults), \
                    "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(
                    parentResults.withComponents(componentNames)
                )
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)

    def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                     dataId: Optional[DataId] = None,
                     datasets: Any = None,
                     collections: Any = None,
                     where: Optional[str] = None,
                     components: Optional[bool] = None,
                     bind: Optional[Mapping[str, Any]] = None,
                     check: bool = True,
                     **kwargs: Any) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        queryDimensionNames = set(requestedDimensions.names)
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components):
                queryDimensionNames.update(datasetType.dimensions.names)
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(summary)
        for datasetType in standardizedDatasetTypes:
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        return queries.DataCoordinateQueryResults(self._db, query)

    def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
                              dataId: Optional[DataId] = None,
                              datasets: Any = None,
                              collections: Any = None,
                              where: Optional[str] = None,
                              components: Optional[bool] = None,
                              bind: Optional[Mapping[str, Any]] = None,
                              check: bool = True,
                              **kwargs: Any) -> Iterator[DimensionRecord]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(f"No such dimension '{element}', available dimensions: "
                               + str(self.dimensions.getStaticElements())) from e
        dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
                                    where=where, components=components, bind=bind, check=check, **kwargs)
        return iter(self._managers.dimensions[element].fetch(dataIds))

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasetAssociations, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(self._managers.collections,
                                                 collectionTypes=frozenset(collectionTypes),
                                                 flattenChains=flattenChains):
            query = storage.select(collectionRecord)
            for row in self._db.query(query.combine()).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names)
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
                                 conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """