Coverage for python/lsst/daf/butler/registries/sql.py: 13%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "SqlRegistry",
)

from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy
from lsst.utils.iteration import ensure_iterable

from ..core import (
    ButlerURI,
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
)
from ..core.utils import transactional
from ..registry import (
    Registry,
    RegistryConfig,
    CollectionType,
    RegistryDefaults,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    CollectionSearch,
)
from ..registry import queries
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
from ..registry.summaries import CollectionSummary
from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import (
        CollectionRecord,
        Database,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
                         dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
                         butlerRoot: Optional[str] = None) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration, if missing then default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration, if missing then default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : `str`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"))
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
                   butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
                   defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create `Registry` subclass instance from ``config``.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `str` or `ButlerURI`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)
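
    # A minimal usage sketch for the two constructors above. The config path,
    # collection name, and butler root are illustrative assumptions, not
    # values defined elsewhere in this module.
    #
    #     from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults
    #
    #     # Create a brand-new (empty) repository database:
    #     config = RegistryConfig("registry.yaml")                      # assumed path
    #     registry = SqlRegistry.createFromConfig(config, butlerRoot="/repo")  # assumed root
    #
    #     # Re-connect to an existing repository, read-only, with defaults:
    #     defaults = RegistryDefaults(collections=["HSC/defaults"])     # assumed collection
    #     registry = SqlRegistry.fromConfig(config, butlerRoot="/repo",
    #                                       writeable=False, defaults=defaults)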

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary one has
        to make sure that there are no currently active connections (no
        session or transaction is in progress) and that the connection pool
        is reset using this method. This method should be called by the child
        process immediately after the fork.
        """
        self._db._engine.dispose()
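
    # A hedged sketch of the fork pattern described above; ``registry`` is an
    # assumed, already-constructed SqlRegistry and ``work`` is a hypothetical
    # function that performs registry queries.
    #
    #     import os
    #
    #     pid = os.fork()
    #     if pid == 0:
    #         # Child: drop connections inherited from the parent before any
    #         # new queries are issued, then do the work and exit.
    #         registry.resetConnectionPool()
    #         work(registry)
    #         os._exit(0)
    #     else:
    #         os.waitpid(pid, 0)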

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)
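
    # A small end-to-end sketch of the opaque-table API above. The table name,
    # column definitions, and row values are illustrative assumptions; real
    # callers (typically Datastores) define their own specs.
    #
    #     from lsst.daf.butler.core import ddl
    #     import sqlalchemy
    #
    #     spec = ddl.TableSpec(fields=[
    #         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #     ])
    #     registry.registerOpaqueTable("my_datastore_records", spec)    # assumed name
    #     registry.insertOpaqueData("my_datastore_records",
    #                               {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)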

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
                           doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)
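
    # A sketch of the collection-management calls above; every collection name
    # here is an illustrative assumption.
    #
    #     registry.registerRun("u/someone/run1")                        # RUN collection
    #     registry.registerCollection("u/someone/tagged", CollectionType.TAGGED)
    #     registry.registerCollection("u/someone/chain", CollectionType.CHAINED)
    #     registry.setCollectionChain("u/someone/chain",
    #                                 ["u/someone/run1", "u/someone/tagged"])
    #     chain = registry.getCollectionChain("u/someone/chain")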

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any = None, timespan: Optional[Timespan] = None,
                    **kwargs: Any) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, defaults=self.defaults.dataId,
                                            **kwargs)
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to findDataset, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if (collectionRecord.type is CollectionType.CALIBRATION
                    and (not storage.datasetType.isCalibration() or timespan is None)):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result
        return None
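
    # A usage sketch for findDataset. The dataset type name, data ID keys, and
    # collection are illustrative assumptions from a typical repository.
    #
    #     ref = registry.findDataset("calexp",                  # assumed dataset type
    #                                instrument="HSC", visit=903334, detector=50,
    #                                collections=["HSC/runs/RC2"])  # assumed collection
    #     if ref is None:
    #         print("no matching dataset in the searched collections")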

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: Optional[str] = None, expand: bool = True,
                       idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to insertDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                               for dataId in progress.wrap(dataIds,
                                                           f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
                               for dataId in dataIds]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs
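
    # A sketch of inserting new datasets. The dataset type, dimensions,
    # storage class, run name, and data IDs are all illustrative assumptions.
    #
    #     datasetType = DatasetType("rawSketch",                        # assumed name
    #                               dimensions=["instrument", "exposure", "detector"],
    #                               storageClass="Exposure",
    #                               universe=registry.dimensions)
    #     registry.registerDatasetType(datasetType)
    #     registry.registerRun("u/someone/ingest")
    #     refs = registry.insertDatasets(
    #         datasetType,
    #         dataIds=[{"instrument": "HSC", "exposure": 903334, "detector": 50}],
    #         run="u/someone/ingest",
    #     )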

    @transactional
    def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
                        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                        reuseIds: bool = False) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to ingestDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                            " RUN collection required.")
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Removing datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Associating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Disassociating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Certifying datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataId]] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
                                   for d in dataIds]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     withDefaults: bool = True,
                     **kwargs: Any) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
                                                  defaults=defaults, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
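
    # A sketch of expanding a minimal data ID into one that carries full
    # dimension records; the dimension names and values are illustrative
    # assumptions.
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334,
    #                                    detector=50)
    #     assert dataId.hasRecords()
    #     # Implied dimensions (e.g. physical_filter) and their records are
    #     # now attached and can be read off the expanded data ID.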

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True,
                            replace: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True,
                          update: bool = False) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)
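
    # A sketch of inserting and syncing dimension rows; the element names and
    # field values below are illustrative assumptions about the default
    # dimension universe.
    #
    #     registry.insertDimensionData("instrument",
    #                                  {"name": "HSC", "detector_max": 200,
    #                                   "visit_max": 100000, "exposure_max": 100000,
    #                                   "class_name": "lsst.obs.subaru.HyperSuprimeCam"})
    #     # syncDimensionData is idempotent: it inserts the row if absent and
    #     # reports whether anything changed.
    #     inserted = registry.syncDimensionData("detector",
    #                                           {"instrument": "HSC", "id": 50,
    #                                            "full_name": "1_36"})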

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None,
                          missing: Optional[List[str]] = None,
                          ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
                                     "if it has components they will not be included in query results.")
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            done.add(name)
            if storage is None:
                if missing is not None:
                    missing.append(name)
            else:
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
                                 "if it has components they will not be included in query results.")
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def _makeQueryBuilder(self, summary: queries.QuerySummary,
                          doomed_by: Iterable[str] = ()) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
            doomed_by=doomed_by,
        )

    def queryDatasets(self, datasetType: Any, *,
                      collections: Any = None,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      findFirst: bool = False,
                      components: Optional[bool] = None,
                      bind: Optional[Mapping[str, Any]] = None,
                      check: bool = True,
                      **kwargs: Any) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasets, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]]  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we
            # populate the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[f"No registered dataset type matching {t!r} found."
                               for t in ensure_iterable(datasetType)],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(parentResults, queries.ParentDatasetQueryResults), \
                    "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(
                    parentResults.withComponents(componentNames)
                )
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if
        # we need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
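
    # A usage sketch for queryDatasets; the dataset type name, collection
    # pattern, and where expression are illustrative assumptions.
    #
    #     refs = registry.queryDatasets(
    #         "calexp",                                  # assumed dataset type
    #         collections=["HSC/runs/*"],                # wildcard, so findFirst must stay False
    #         where="instrument = 'HSC' AND visit = 903334",
    #     )
    #     for ref in refs:
    #         print(ref.datasetType.name, ref.dataId, ref.run)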

    def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                     dataId: Optional[DataId] = None,
                     datasets: Any = None,
                     collections: Any = None,
                     where: Optional[str] = None,
                     components: Optional[bool] = None,
                     bind: Optional[Mapping[str, Any]] = None,
                     check: bool = True,
                     **kwargs: Any) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        queryDimensionNames = set(requestedDimensions.names)
        missing: List[str] = []
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it
                # multiple times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
                queryDimensionNames.update(datasetType.dimensions.names)
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(
            summary,
            doomed_by=[f"Dataset type {name} is not registered." for name in missing]
        )
        for datasetType in standardizedDatasetTypes:
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        return queries.DataCoordinateQueryResults(self._db, query)
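
    # A usage sketch for queryDataIds; the dimension names, dataset-type
    # constraint, and collection are illustrative assumptions.
    #
    #     dataIds = registry.queryDataIds(
    #         ["visit", "detector"],
    #         datasets="calexp",                     # assumed dataset type
    #         collections="HSC/runs/RC2",            # assumed collection
    #         where="instrument = 'HSC' AND detector = 50",
    #     )
    #     for dataId in dataIds:
    #         print(dataId["visit"], dataId["detector"])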

    def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
                              dataId: Optional[DataId] = None,
                              datasets: Any = None,
                              collections: Any = None,
                              where: Optional[str] = None,
                              components: Optional[bool] = None,
                              bind: Optional[Mapping[str, Any]] = None,
                              check: bool = True,
                              **kwargs: Any) -> Iterator[DimensionRecord]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(f"No such dimension '{element}', available dimensions: "
                               + str(self.dimensions.getStaticElements())) from e
        dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets,
                                    collections=collections, where=where, components=components,
                                    bind=bind, check=check, **kwargs)
        return iter(self._managers.dimensions[element].fetch(dataIds))

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasetAssociations, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(self._managers.collections,
                                                 collectionTypes=frozenset(collectionTypes),
                                                 flattenChains=flattenChains):
            query = storage.select(collectionRecord)
            for row in self._db.query(query.combine()).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names)
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
                                 conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """