Coverage for python/lsst/daf/butler/registries/sql.py: 13%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "SqlRegistry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Tuple,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
46from lsst.utils.iteration import ensure_iterable
48from ..core import (
49 ButlerURI,
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 ddl,
59 Dimension,
60 DimensionConfig,
61 DimensionElement,
62 DimensionGraph,
63 DimensionRecord,
64 DimensionUniverse,
65 NamedKeyMapping,
66 NameLookupMapping,
67 Progress,
68 StorageClassFactory,
69 Timespan,
70)
71from ..core.utils import transactional
73from ..registry import (
74 Registry,
75 RegistryConfig,
76 CollectionType,
77 RegistryDefaults,
78 ConflictingDefinitionError,
79 InconsistentDataIdError,
80 OrphanedRecordError,
81 CollectionSearch,
82)
83from ..registry import queries
85from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
86from ..registry.summaries import CollectionSummary
87from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
88from ..registry.queries import Query
89from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
91if TYPE_CHECKING:
92 from .._butlerConfig import ButlerConfig
93 from ..registry.interfaces import (
94 CollectionRecord,
95 Database,
96 DatastoreRegistryBridgeManager,
97 )
100_LOG = logging.getLogger(__name__)
103class SqlRegistry(Registry):
104 """Registry implementation based on SQLAlchemy.
106 Parameters
107 ----------
108 database : `Database`
109 Database instance to store Registry.
110 defaults : `RegistryDefaults`
111 Default collection search path and/or output `~CollectionType.RUN`
112 collection.
113 managers : `RegistryManagerInstances`
114 All the managers required for this registry.
115 """
117 defaultConfigFile: Optional[str] = None
118 """Path to configuration defaults. Accessed within the ``configs`` resource
119 or relative to a search path. Can be None if no defaults specified.
120 """
122 @classmethod
123 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
124 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
125 butlerRoot: Optional[str] = None) -> Registry:
126 """Create registry database and return `SqlRegistry` instance.
128 This method initializes the database contents; the database must be empty
129 prior to calling this method.
131 Parameters
132 ----------
133 config : `RegistryConfig` or `str`, optional
134 Registry configuration; if missing, the default configuration will
135 be loaded from ``registry.yaml``.
136 dimensionConfig : `DimensionConfig` or `str`, optional
137 Dimensions configuration; if missing, the default configuration
138 will be loaded from ``dimensions.yaml``.
139 butlerRoot : `str`, optional
140 Path to the repository root this `SqlRegistry` will manage.
142 Returns
143 -------
144 registry : `SqlRegistry`
145 A new `SqlRegistry` instance.
146 """
147 config = cls.forceRegistryConfig(config)
148 config.replaceRoot(butlerRoot)
150 if isinstance(dimensionConfig, str):
151 dimensionConfig = DimensionConfig(dimensionConfig)
152 elif dimensionConfig is None:
153 dimensionConfig = DimensionConfig()
154 elif not isinstance(dimensionConfig, DimensionConfig):
155 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
157 DatabaseClass = config.getDatabaseClass()
158 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
159 namespace=config.get("namespace"))
160 managerTypes = RegistryManagerTypes.fromConfig(config)
161 managers = managerTypes.makeRepo(database, dimensionConfig)
162 return cls(database, RegistryDefaults(), managers)
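    # Illustrative sketch of creating a brand-new registry with this method;
    # the repository root below is a hypothetical path, not part of this module:
    #
    #     from lsst.daf.butler.registry import RegistryConfig
    #
    #     config = RegistryConfig()  # falls back to the packaged registry.yaml defaults
    #     registry = SqlRegistry.createFromConfig(config, butlerRoot="/tmp/example_repo")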
164 @classmethod
165 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
166 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
167 defaults: Optional[RegistryDefaults] = None) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Registry database must be initialized prior to calling this method.
172 Parameters
173 ----------
174 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
175 Registry configuration.
176 butlerRoot : `str` or `ButlerURI`, optional
177 Path to the repository root this `Registry` will manage.
178 writeable : `bool`, optional
179 If `True` (default) create a read-write connection to the database.
180 defaults : `RegistryDefaults`, optional
181 Default collection search path and/or output `~CollectionType.RUN`
182 collection.
184 Returns
185 -------
186 registry : `SqlRegistry` (subclass)
187 A new `SqlRegistry` subclass instance.
188 """
189 config = cls.forceRegistryConfig(config)
190 config.replaceRoot(butlerRoot)
191 DatabaseClass = config.getDatabaseClass()
192 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
193 namespace=config.get("namespace"), writeable=writeable)
194 managerTypes = RegistryManagerTypes.fromConfig(config)
195 managers = managerTypes.loadRepo(database)
196 if defaults is None:
197 defaults = RegistryDefaults()
198 return cls(database, defaults, managers)
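    # Sketch of opening an existing repository read-only with default
    # collections; the collection and run names here are hypothetical:
    #
    #     from lsst.daf.butler.registry import RegistryDefaults
    #
    #     defaults = RegistryDefaults(collections=["HSC/defaults"], run="u/someone/run")
    #     registry = SqlRegistry.fromConfig(config, writeable=False, defaults=defaults)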
200 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
201 self._db = database
202 self._managers = managers
203 self.storageClasses = StorageClassFactory()
204 # Intentionally invoke property setter to initialize defaults. This
205 # can only be done after most of the rest of Registry has already been
206 # initialized, and must be done before the property getter is used.
207 self.defaults = defaults
209 def __str__(self) -> str:
210 return str(self._db)
212 def __repr__(self) -> str:
213 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
215 def isWriteable(self) -> bool:
216 # Docstring inherited from lsst.daf.butler.registry.Registry
217 return self._db.isWriteable()
219 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
220 # Docstring inherited from lsst.daf.butler.registry.Registry
221 if defaults is None:
222 # No need to copy, because `RegistryDefaults` is immutable; we
223 # effectively copy on write.
224 defaults = self.defaults
225 return type(self)(self._db, defaults, self._managers)
227 @property
228 def dimensions(self) -> DimensionUniverse:
229 # Docstring inherited from lsst.daf.butler.registry.Registry
230 return self._managers.dimensions.universe
232 def refresh(self) -> None:
233 # Docstring inherited from lsst.daf.butler.registry.Registry
234 self._managers.refresh()
236 @contextlib.contextmanager
237 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
238 # Docstring inherited from lsst.daf.butler.registry.Registry
239 try:
240 with self._db.transaction(savepoint=savepoint):
241 yield
242 except BaseException:
243 # TODO: this clears the caches sometimes when we wouldn't actually
244 # need to. Can we avoid that?
245 self._managers.dimensions.clearCaches()
246 raise
248 def resetConnectionPool(self) -> None:
249 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
251 This operation is useful when using the registry with fork-based
252 multiprocessing. To use the registry across a fork boundary, one must
253 ensure that there are no currently active connections (no session or
254 transaction in progress) and reset the connection pool using this
255 method. The child process should call this method immediately
256 after the fork.
257 """
258 self._db._engine.dispose()
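    # Sketch of the fork-based multiprocessing pattern described above: ensure
    # no transaction is open in the parent, then reset the pool in each child
    # before it touches the registry. The worker logic is hypothetical:
    #
    #     import multiprocessing
    #
    #     def worker(registry, name):
    #         registry.resetConnectionPool()  # must be the first registry call after fork
    #         print(registry.getCollectionType(name))
    #
    #     ctx = multiprocessing.get_context("fork")
    #     p = ctx.Process(target=worker, args=(registry, "HSC/raw/all"))
    #     p.start()
    #     p.join()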
260 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
261 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
262 other data repository client.
264 Opaque table records can be added via `insertOpaqueData`, retrieved via
265 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
267 Parameters
268 ----------
269 tableName : `str`
270 Logical name of the opaque table. This may differ from the
271 actual name used in the database by a prefix and/or suffix.
272 spec : `ddl.TableSpec`
273 Specification for the table to be added.
274 """
275 self._managers.opaque.register(tableName, spec)
277 @transactional
278 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
279 """Insert records into an opaque table.
281 Parameters
282 ----------
283 tableName : `str`
284 Logical name of the opaque table. Must match the name used in a
285 previous call to `registerOpaqueTable`.
286 data
287 Each additional positional argument is a dictionary that represents
288 a single row to be added.
289 """
290 self._managers.opaque[tableName].insert(*data)
292 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
293 """Retrieve records from an opaque table.
295 Parameters
296 ----------
297 tableName : `str`
298 Logical name of the opaque table. Must match the name used in a
299 previous call to `registerOpaqueTable`.
300 where
301 Additional keyword arguments are interpreted as equality
302 constraints that restrict the returned rows (combined with AND);
303 keyword arguments are column names and values are the values they
304 must have.
306 Yields
307 ------
308 row : `dict`
309 A dictionary representing a single result row.
310 """
311 yield from self._managers.opaque[tableName].fetch(**where)
313 @transactional
314 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
315 """Remove records from an opaque table.
317 Parameters
318 ----------
319 tableName : `str`
320 Logical name of the opaque table. Must match the name used in a
321 previous call to `registerOpaqueTable`.
322 where
323 Additional keyword arguments are interpreted as equality
324 constraints that restrict the deleted rows (combined with AND);
325 keyword arguments are column names and values are the values they
326 must have.
327 """
328 self._managers.opaque[tableName].delete(where.keys(), where)
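    # Sketch of the opaque-table round trip defined by the methods above; the
    # table name and column layout are invented for illustration:
    #
    #     table = "example_datastore_records"
    #     spec = ddl.TableSpec(fields=[
    #         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #     ])
    #     registry.registerOpaqueTable(table, spec)
    #     registry.insertOpaqueData(table, {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData(table, dataset_id=1))
    #     registry.deleteOpaqueData(table, dataset_id=1)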
330 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
331 doc: Optional[str] = None) -> bool:
332 # Docstring inherited from lsst.daf.butler.registry.Registry
333 _, registered = self._managers.collections.register(name, type, doc=doc)
334 return registered
336 def getCollectionType(self, name: str) -> CollectionType:
337 # Docstring inherited from lsst.daf.butler.registry.Registry
338 return self._managers.collections.find(name).type
340 def _get_collection_record(self, name: str) -> CollectionRecord:
341 # Docstring inherited from lsst.daf.butler.registry.Registry
342 return self._managers.collections.find(name)
344 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
345 # Docstring inherited from lsst.daf.butler.registry.Registry
346 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
347 return registered
349 @transactional
350 def removeCollection(self, name: str) -> None:
351 # Docstring inherited from lsst.daf.butler.registry.Registry
352 self._managers.collections.remove(name)
354 def getCollectionChain(self, parent: str) -> CollectionSearch:
355 # Docstring inherited from lsst.daf.butler.registry.Registry
356 record = self._managers.collections.find(parent)
357 if record.type is not CollectionType.CHAINED:
358 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
359 assert isinstance(record, ChainedCollectionRecord)
360 return record.children
362 @transactional
363 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
364 # Docstring inherited from lsst.daf.butler.registry.Registry
365 record = self._managers.collections.find(parent)
366 if record.type is not CollectionType.CHAINED:
367 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
368 assert isinstance(record, ChainedCollectionRecord)
369 children = CollectionSearch.fromExpression(children)
370 if children != record.children or flatten:
371 record.update(self._managers.collections, children, flatten=flatten)
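    # Sketch of building a CHAINED collection from the methods above; the
    # collection names are hypothetical:
    #
    #     registry.registerRun("u/someone/run1")
    #     registry.registerRun("u/someone/run2")
    #     registry.registerCollection("u/someone/chain", CollectionType.CHAINED)
    #     registry.setCollectionChain("u/someone/chain", ["u/someone/run2", "u/someone/run1"])
    #     registry.getCollectionChain("u/someone/chain")  # -> search order: run2, run1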
373 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
374 # Docstring inherited from lsst.daf.butler.registry.Registry
375 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
377 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
378 # Docstring inherited from lsst.daf.butler.registry.Registry
379 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
381 def getCollectionSummary(self, collection: str) -> CollectionSummary:
382 # Docstring inherited from lsst.daf.butler.registry.Registry
383 record = self._managers.collections.find(collection)
384 return self._managers.datasets.getCollectionSummary(record)
386 def registerDatasetType(self, datasetType: DatasetType) -> bool:
387 # Docstring inherited from lsst.daf.butler.registry.Registry
388 _, inserted = self._managers.datasets.register(datasetType)
389 return inserted
391 def removeDatasetType(self, name: str) -> None:
392 # Docstring inherited from lsst.daf.butler.registry.Registry
393 self._managers.datasets.remove(name)
395 def getDatasetType(self, name: str) -> DatasetType:
396 # Docstring inherited from lsst.daf.butler.registry.Registry
397 return self._managers.datasets[name].datasetType
399 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
400 # Docstring inherited from lsst.daf.butler.registry.Registry
401 return self._managers.datasets.supportsIdGenerationMode(mode)
403 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
404 collections: Any = None, timespan: Optional[Timespan] = None,
405 **kwargs: Any) -> Optional[DatasetRef]:
406 # Docstring inherited from lsst.daf.butler.registry.Registry
407 if isinstance(datasetType, DatasetType):
408 storage = self._managers.datasets[datasetType.name]
409 else:
410 storage = self._managers.datasets[datasetType]
411 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
412 universe=self.dimensions, defaults=self.defaults.dataId,
413 **kwargs)
414 if collections is None:
415 if not self.defaults.collections:
416 raise TypeError("No collections provided to findDataset, "
417 "and no defaults from registry construction.")
418 collections = self.defaults.collections
419 else:
420 collections = CollectionSearch.fromExpression(collections)
421 for collectionRecord in collections.iter(self._managers.collections):
422 if (collectionRecord.type is CollectionType.CALIBRATION
423 and (not storage.datasetType.isCalibration() or timespan is None)):
424 continue
425 result = storage.find(collectionRecord, dataId, timespan=timespan)
426 if result is not None:
427 return result
429 return None
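    # Sketch of a findDataset call; the instrument, detector, filter, and
    # collection values, and the timespan variable, are placeholders:
    #
    #     ref = registry.findDataset(
    #         "flat",
    #         instrument="HSC", detector=50, physical_filter="HSC-R",
    #         collections=["HSC/calib"],
    #         timespan=someExposureTimespan,  # required to search CALIBRATION collections
    #     )
    #     if ref is None:
    #         ...  # no matching dataset in the search path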
431 @transactional
432 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
433 run: Optional[str] = None, expand: bool = True,
434 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
435 # Docstring inherited from lsst.daf.butler.registry.Registry
436 if isinstance(datasetType, DatasetType):
437 storage = self._managers.datasets.find(datasetType.name)
438 if storage is None:
439 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
440 else:
441 storage = self._managers.datasets.find(datasetType)
442 if storage is None:
443 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
444 if run is None:
445 if self.defaults.run is None:
446 raise TypeError("No run provided to insertDatasets, "
447 "and no default from registry construction.")
448 run = self.defaults.run
449 runRecord = self._managers.collections.find(run)
450 if runRecord.type is not CollectionType.RUN:
451 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
452 assert isinstance(runRecord, RunRecord)
453 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
454 if expand:
455 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
456 for dataId in progress.wrap(dataIds,
457 f"Expanding {storage.datasetType.name} data IDs")]
458 else:
459 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
460 for dataId in dataIds]
461 try:
462 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
463 except sqlalchemy.exc.IntegrityError as err:
464 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
465 f"one or more datasets of type {storage.datasetType} into "
466 f"collection '{run}'. "
467 f"This probably means a dataset with the same data ID "
468 f"and dataset type already exists, but it may also mean a "
469 f"dimension row is missing.") from err
470 return refs
472 @transactional
473 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
474 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
475 reuseIds: bool = False) -> List[DatasetRef]:
476 # Docstring inherited from lsst.daf.butler.registry.Registry
477 datasets = list(datasets)
478 if not datasets:
479 # nothing to do
480 return []
482 # find dataset type
483 datasetTypes = set(dataset.datasetType for dataset in datasets)
484 if len(datasetTypes) != 1:
485 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
486 datasetType = datasetTypes.pop()
488 # get storage handler for this dataset type
489 storage = self._managers.datasets.find(datasetType.name)
490 if storage is None:
491 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
493 # find run name
494 runs = set(dataset.run for dataset in datasets)
495 if len(runs) != 1:
496 raise ValueError(f"Multiple run names in input datasets: {runs}")
497 run = runs.pop()
498 if run is None:
499 if self.defaults.run is None:
500 raise TypeError("No run provided to ingestDatasets, "
501 "and no default from registry construction.")
502 run = self.defaults.run
504 runRecord = self._managers.collections.find(run)
505 if runRecord.type is not CollectionType.RUN:
506 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
507 " RUN collection required.")
508 assert isinstance(runRecord, RunRecord)
510 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
511 if expand:
512 expandedDatasets = [
513 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
514 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
515 else:
516 expandedDatasets = [
517 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
518 for dataset in datasets
519 ]
521 try:
522 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
523 except sqlalchemy.exc.IntegrityError as err:
524 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
525 f"one or more datasets of type {storage.datasetType} into "
526 f"collection '{run}'. "
527 f"This probably means a dataset with the same data ID "
528 f"and dataset type already exists, but it may also mean a "
529 f"dimension row is missing.") from err
530 return refs
532 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
533 # Docstring inherited from lsst.daf.butler.registry.Registry
534 return self._managers.datasets.getDatasetRef(id)
536 @transactional
537 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
538 # Docstring inherited from lsst.daf.butler.registry.Registry
539 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
540 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
541 desc="Removing datasets by type"):
542 storage = self._managers.datasets[datasetType.name]
543 try:
544 storage.delete(refsForType)
545 except sqlalchemy.exc.IntegrityError as err:
546 raise OrphanedRecordError("One or more datasets are still "
547 "present in one or more Datastores.") from err
549 @transactional
550 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
551 # Docstring inherited from lsst.daf.butler.registry.Registry
552 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
553 collectionRecord = self._managers.collections.find(collection)
554 if collectionRecord.type is not CollectionType.TAGGED:
555 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
556 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
557 desc="Associating datasets by type"):
558 storage = self._managers.datasets[datasetType.name]
559 try:
560 storage.associate(collectionRecord, refsForType)
561 except sqlalchemy.exc.IntegrityError as err:
562 raise ConflictingDefinitionError(
563 f"Constraint violation while associating dataset of type {datasetType.name} with "
564 f"collection {collection}. This probably means that one or more datasets with the same "
565 f"dataset type and data ID already exist in the collection, but it may also indicate "
566 f"that the datasets do not exist."
567 ) from err
569 @transactional
570 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
571 # Docstring inherited from lsst.daf.butler.registry.Registry
572 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
573 collectionRecord = self._managers.collections.find(collection)
574 if collectionRecord.type is not CollectionType.TAGGED:
575 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
576 "expected TAGGED.")
577 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
578 desc="Disassociating datasets by type"):
579 storage = self._managers.datasets[datasetType.name]
580 storage.disassociate(collectionRecord, refsForType)
582 @transactional
583 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
584 # Docstring inherited from lsst.daf.butler.registry.Registry
585 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
586 collectionRecord = self._managers.collections.find(collection)
587 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
588 desc="Certifying datasets by type"):
589 storage = self._managers.datasets[datasetType.name]
590 storage.certify(collectionRecord, refsForType, timespan)
592 @transactional
593 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
594 dataIds: Optional[Iterable[DataId]] = None) -> None:
595 # Docstring inherited from lsst.daf.butler.registry.Registry
596 collectionRecord = self._managers.collections.find(collection)
597 if isinstance(datasetType, str):
598 storage = self._managers.datasets[datasetType]
599 else:
600 storage = self._managers.datasets[datasetType.name]
601 standardizedDataIds = None
602 if dataIds is not None:
603 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
604 for d in dataIds]
605 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
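    # Sketch of managing a CALIBRATION collection with certify/decertify; the
    # collection name, dataset refs, and timespan endpoints are placeholders:
    #
    #     registry.registerCollection("HSC/calib/example", CollectionType.CALIBRATION)
    #     registry.certify("HSC/calib/example", biasRefs, Timespan(begin=t0, end=t1))
    #     # Later, retract part of that validity range:
    #     registry.decertify("HSC/calib/example", "bias", Timespan(begin=t0, end=tMid))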
607 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
608 """Return an object that allows a new `Datastore` instance to
609 communicate with this `Registry`.
611 Returns
612 -------
613 manager : `DatastoreRegistryBridgeManager`
614 Object that mediates communication between this `Registry` and its
615 associated datastores.
616 """
617 return self._managers.datastores
619 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
620 # Docstring inherited from lsst.daf.butler.registry.Registry
621 return self._managers.datastores.findDatastores(ref)
623 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
624 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
625 withDefaults: bool = True,
626 **kwargs: Any) -> DataCoordinate:
627 # Docstring inherited from lsst.daf.butler.registry.Registry
628 if not withDefaults:
629 defaults = None
630 else:
631 defaults = self.defaults.dataId
632 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
633 defaults=defaults, **kwargs)
634 if standardized.hasRecords():
635 return standardized
636 if records is None:
637 records = {}
638 elif isinstance(records, NamedKeyMapping):
639 records = records.byName()
640 else:
641 records = dict(records)
642 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
643 records.update(dataId.records.byName())
644 keys = standardized.byName()
645 for element in standardized.graph.primaryKeyTraversalOrder:
646 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
647 if record is ...:
648 if isinstance(element, Dimension) and keys.get(element.name) is None:
649 if element in standardized.graph.required:
650 raise LookupError(
651 f"No value or null value for required dimension {element.name}."
652 )
653 keys[element.name] = None
654 record = None
655 else:
656 storage = self._managers.dimensions[element]
657 dataIdSet = DataCoordinateIterable.fromScalar(
658 DataCoordinate.standardize(keys, graph=element.graph)
659 )
660 fetched = tuple(storage.fetch(dataIdSet))
661 try:
662 (record,) = fetched
663 except ValueError:
664 record = None
665 records[element.name] = record
666 if record is not None:
667 for d in element.implied:
668 value = getattr(record, d.name)
669 if keys.setdefault(d.name, value) != value:
670 raise InconsistentDataIdError(
671 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
672 f"but {element.name} implies {d.name}={value!r}."
673 )
674 else:
675 if element in standardized.graph.required:
676 raise LookupError(
677 f"Could not fetch record for required dimension {element.name} via keys {keys}."
678 )
679 if element.alwaysJoin:
680 raise InconsistentDataIdError(
681 f"Could not fetch record for element {element.name} via keys {keys}, ",
682 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
683 "related."
684 )
685 for d in element.implied:
686 keys.setdefault(d.name, None)
687 records.setdefault(d.name, None)
688 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
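    # Sketch of expanding a minimal data ID into one carrying dimension
    # records; the instrument and exposure values are placeholders:
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
    #     dataId.records["exposure"].timespan   # records are now attached
    #     dataId["physical_filter"]             # implied values filled in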
690 def insertDimensionData(self, element: Union[DimensionElement, str],
691 *data: Union[Mapping[str, Any], DimensionRecord],
692 conform: bool = True,
693 replace: bool = False) -> None:
694 # Docstring inherited from lsst.daf.butler.registry.Registry
695 if conform:
696 if isinstance(element, str):
697 element = self.dimensions[element]
698 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
699 for row in data]
700 else:
701 # Ignore typing since caller said to trust them with conform=False.
702 records = data # type: ignore
703 storage = self._managers.dimensions[element] # type: ignore
704 storage.insert(*records, replace=replace)
706 def syncDimensionData(self, element: Union[DimensionElement, str],
707 row: Union[Mapping[str, Any], DimensionRecord],
708 conform: bool = True,
709 update: bool = False) -> Union[bool, Dict[str, Any]]:
710 # Docstring inherited from lsst.daf.butler.registry.Registry
711 if conform:
712 if isinstance(element, str):
713 element = self.dimensions[element]
714 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
715 else:
716 # Ignore typing since caller said to trust them with conform=False.
717 record = row # type: ignore
718 storage = self._managers.dimensions[element] # type: ignore
719 return storage.sync(record, update=update)
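    # Sketch of adding dimension records with the two methods above; the
    # instrument values are invented:
    #
    #     record = {"name": "DummyCam", "visit_max": 1_000_000,
    #               "exposure_max": 1_000_000, "detector_max": 4}
    #     registry.insertDimensionData("instrument", record)
    #     # or, idempotently (returns whether an insert actually happened):
    #     registry.syncDimensionData("instrument", record)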
721 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None,
722 missing: Optional[List[str]] = None,
723 ) -> Iterator[DatasetType]:
724 # Docstring inherited from lsst.daf.butler.registry.Registry
725 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
726 if wildcard is Ellipsis:
727 for datasetType in self._managers.datasets:
728 # The dataset type can no longer be a component
729 yield datasetType
730 if components:
731 # Automatically create the component dataset types
732 try:
733 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
734 except KeyError as err:
735 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
736 "if it has components they will not be included in query results.")
737 else:
738 yield from componentsForDatasetType
739 return
740 done: Set[str] = set()
741 for name in wildcard.strings:
742 storage = self._managers.datasets.find(name)
743 done.add(name)
744 if storage is None:
745 if missing is not None:
746 missing.append(name)
747 else:
748 yield storage.datasetType
749 if wildcard.patterns:
750 # If components (the argument) is None, we'll save component
751 # dataset types that we might want to match, but only if their parents
752 # didn't get included.
753 componentsForLater = []
754 for registeredDatasetType in self._managers.datasets:
755 # Components are not stored in registry so expand them here
756 allDatasetTypes = [registeredDatasetType]
757 try:
758 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
759 except KeyError as err:
760 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
761 "if it has components they will not be included in query results.")
762 for datasetType in allDatasetTypes:
763 if datasetType.name in done:
764 continue
765 parentName, componentName = datasetType.nameAndComponent()
766 if componentName is not None and not components:
767 if components is None and parentName not in done:
768 componentsForLater.append(datasetType)
769 continue
770 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
771 done.add(datasetType.name)
772 yield datasetType
773 # Go back and try to match saved components.
774 for datasetType in componentsForLater:
775 parentName, _ = datasetType.nameAndComponent()
776 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
777 yield datasetType
779 def queryCollections(self, expression: Any = ...,
780 datasetType: Optional[DatasetType] = None,
781 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
782 flattenChains: bool = False,
783 includeChains: Optional[bool] = None) -> Iterator[str]:
784 # Docstring inherited from lsst.daf.butler.registry.Registry
786 # Right now the datasetType argument is completely ignored, but that
787 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
788 # ticket will take care of that.
789 query = CollectionQuery.fromExpression(expression)
790 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
791 flattenChains=flattenChains, includeChains=includeChains):
792 yield record.name
794 def _makeQueryBuilder(self, summary: queries.QuerySummary,
795 doomed_by: Iterable[str] = ()) -> queries.QueryBuilder:
796 """Return a `QueryBuilder` instance capable of constructing and
797 managing more complex queries than those obtainable via `Registry`
798 interfaces.
800 This is an advanced interface; downstream code should prefer
801 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
802 are sufficient.
804 Parameters
805 ----------
806 summary : `queries.QuerySummary`
807 Object describing and categorizing the full set of dimensions that
808 will be included in the query.
809 doomed_by : `Iterable` of `str`, optional
810 A list of diagnostic messages that indicate why the query is going
811 to yield no results and should not even be executed. If an empty
812 container (default), the query will be executed unless other code
813 determines that it is doomed.
815 Returns
816 -------
817 builder : `queries.QueryBuilder`
818 Object that can be used to construct and perform advanced queries.
819 """
820 return queries.QueryBuilder(
821 summary,
822 queries.RegistryManagers(
823 collections=self._managers.collections,
824 dimensions=self._managers.dimensions,
825 datasets=self._managers.datasets,
826 TimespanReprClass=self._db.getTimespanRepresentation(),
827 ),
828 doomed_by=doomed_by,
829 )
831 def queryDatasets(self, datasetType: Any, *,
832 collections: Any = None,
833 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
834 dataId: Optional[DataId] = None,
835 where: Optional[str] = None,
836 findFirst: bool = False,
837 components: Optional[bool] = None,
838 bind: Optional[Mapping[str, Any]] = None,
839 check: bool = True,
840 **kwargs: Any) -> queries.DatasetQueryResults:
841 # Docstring inherited from lsst.daf.butler.registry.Registry
843 # Standardize the collections expression.
844 if collections is None:
845 if not self.defaults.collections:
846 raise TypeError("No collections provided to findDataset, "
847 "and no defaults from registry construction.")
848 collections = self.defaults.collections
849 elif findFirst:
850 collections = CollectionSearch.fromExpression(collections)
851 else:
852 collections = CollectionQuery.fromExpression(collections)
853 # Standardize and expand the data ID provided as a constraint.
854 standardizedDataId = self.expandDataId(dataId, **kwargs)
856 # We can only query directly if given a non-component DatasetType
857 # instance. If we were given an expression or str or a component
858 # DatasetType instance, we'll populate this dict, recurse, and return.
859 # If we already have a non-component DatasetType, it will remain None
860 # and we'll run the query directly.
861 composition: Optional[
862 Dict[
863 DatasetType, # parent dataset type
864 List[Optional[str]] # component name, or None for parent
865 ]
866 ] = None
867 if not isinstance(datasetType, DatasetType):
868 # We were given a dataset type expression (which may be as simple
869 # as a str). Loop over all matching datasets, delegating handling
870 # of the `components` argument to queryDatasetTypes, as we populate
871 # the composition dict.
872 composition = defaultdict(list)
873 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
874 parentName, componentName = trueDatasetType.nameAndComponent()
875 if componentName is not None:
876 parentDatasetType = self.getDatasetType(parentName)
877 composition.setdefault(parentDatasetType, []).append(componentName)
878 else:
879 composition.setdefault(trueDatasetType, []).append(None)
880 if not composition:
881 return queries.ChainedDatasetQueryResults(
882 [],
883 doomed_by=[f"No registered dataset type matching {t!r} found."
884 for t in ensure_iterable(datasetType)],
885 )
886 elif datasetType.isComponent():
887 # We were given a true DatasetType instance, but it's a component.
888 # The composition dict will have exactly one item.
889 parentName, componentName = datasetType.nameAndComponent()
890 parentDatasetType = self.getDatasetType(parentName)
891 composition = {parentDatasetType: [componentName]}
892 if composition is not None:
893 # We need to recurse. Do that once for each parent dataset type.
894 chain = []
895 for parentDatasetType, componentNames in composition.items():
896 parentResults = self.queryDatasets(
897 parentDatasetType,
898 collections=collections,
899 dimensions=dimensions,
900 dataId=standardizedDataId,
901 where=where,
902 bind=bind,
903 findFirst=findFirst,
904 check=check,
905 )
906 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \
907 "Should always be true if passing in a DatasetType instance, and we are."
908 chain.append(
909 parentResults.withComponents(componentNames)
910 )
911 return queries.ChainedDatasetQueryResults(chain)
912 # If we get here, there's no need to recurse (or we are already
913 # recursing; there can only ever be one level of recursion).
915 # The full set of dimensions in the query is the combination of those
916 # needed for the DatasetType and those explicitly requested, if any.
917 requestedDimensionNames = set(datasetType.dimensions.names)
918 if dimensions is not None:
919 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
920 # Construct the summary structure needed to construct a QueryBuilder.
921 summary = queries.QuerySummary(
922 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
923 dataId=standardizedDataId,
924 expression=where,
925 bind=bind,
926 defaults=self.defaults.dataId,
927 check=check,
928 datasets=[datasetType],
929 )
930 builder = self._makeQueryBuilder(summary)
931 # Add the dataset subquery to the query, telling the QueryBuilder to
932 # include the rank of the selected collection in the results only if we
933 # need to findFirst. Note that if any of the collections are
934 # actually wildcard expressions, and we've asked for deduplication,
935 # this will raise TypeError for us.
936 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
937 query = builder.finish()
938 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
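    # Sketch of a queryDatasets call with a user expression and bind values;
    # the collection and dataset type names are placeholders:
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/runs/example"],
    #         where="visit.exposure_time > t AND band = 'r'",
    #         bind={"t": 30.0},
    #     )
    #     for ref in refs:
    #         ...  # each result is a DatasetRef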
940 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
941 dataId: Optional[DataId] = None,
942 datasets: Any = None,
943 collections: Any = None,
944 where: Optional[str] = None,
945 components: Optional[bool] = None,
946 bind: Optional[Mapping[str, Any]] = None,
947 check: bool = True,
948 **kwargs: Any) -> queries.DataCoordinateQueryResults:
949 # Docstring inherited from lsst.daf.butler.registry.Registry
950 dimensions = ensure_iterable(dimensions)
951 standardizedDataId = self.expandDataId(dataId, **kwargs)
952 standardizedDatasetTypes = set()
953 requestedDimensions = self.dimensions.extract(dimensions)
954 missing: List[str] = []
955 if datasets is not None:
956 if not collections:
957 if not self.defaults.collections:
958 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
959 collections = self.defaults.collections
960 else:
961 # Preprocess collections expression in case the original
962 # included single-pass iterators (we'll want to use it multiple
963 # times below).
964 collections = CollectionQuery.fromExpression(collections)
965 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
966 # If any matched dataset type is a component, just operate on
967 # its parent instead, because Registry doesn't know anything
968 # about what components exist, and here (unlike queryDatasets)
969 # we don't care about returning them.
970 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
971 if componentName is not None:
972 datasetType = self.getDatasetType(parentDatasetTypeName)
973 standardizedDatasetTypes.add(datasetType)
974 elif collections:
975 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
977 def query_factory(order_by: Optional[Iterable[str]] = None,
978 limit: Optional[Tuple[int, Optional[int]]] = None) -> Query:
979 """Construct the Query object that generates query results.
980 """
981 summary = queries.QuerySummary(
982 requested=requestedDimensions,
983 dataId=standardizedDataId,
984 expression=where,
985 bind=bind,
986 defaults=self.defaults.dataId,
987 check=check,
988 datasets=standardizedDatasetTypes,
989 order_by=order_by,
990 limit=limit
991 )
992 builder = self._makeQueryBuilder(
993 summary,
994 doomed_by=[f"Dataset type {name} is not registered." for name in missing]
995 )
996 for datasetType in standardizedDatasetTypes:
997 builder.joinDataset(datasetType, collections, isResult=False)
998 return builder.finish()
1000 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
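    # Sketch of a queryDataIds call constrained by dataset existence; the
    # dataset type, collection, and instrument names are placeholders:
    #
    #     dataIds = registry.queryDataIds(
    #         ["visit", "detector"],
    #         datasets="raw",
    #         collections=["HSC/raw/all"],
    #         instrument="HSC",
    #     )
    #     dataIds.count()           # number of matching data IDs
    #     list(dataIds.expanded())  # attach dimension records to each result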
1002 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1003 dataId: Optional[DataId] = None,
1004 datasets: Any = None,
1005 collections: Any = None,
1006 where: Optional[str] = None,
1007 components: Optional[bool] = None,
1008 bind: Optional[Mapping[str, Any]] = None,
1009 check: bool = True,
1010 **kwargs: Any) -> queries.DimensionRecordQueryResults:
1011 # Docstring inherited from lsst.daf.butler.registry.Registry
1012 if not isinstance(element, DimensionElement):
1013 try:
1014 element = self.dimensions[element]
1015 except KeyError as e:
1016 raise KeyError(f"No such dimension '{element}', available dimensions: "
1017 + str(self.dimensions.getStaticElements())) from e
1018 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1019 where=where, components=components, bind=bind, check=check, **kwargs)
1020 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
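    # Sketch of a queryDimensionRecords call; the instrument name and record
    # fields are placeholders:
    #
    #     for rec in registry.queryDimensionRecords("detector", instrument="HSC"):
    #         print(rec.id, rec.full_name)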
1022 def queryDatasetAssociations(
1023 self,
1024 datasetType: Union[str, DatasetType],
1025 collections: Any = ...,
1026 *,
1027 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1028 flattenChains: bool = False,
1029 ) -> Iterator[DatasetAssociation]:
1030 # Docstring inherited from lsst.daf.butler.registry.Registry
1031 if collections is None:
1032 if not self.defaults.collections:
1033 raise TypeError("No collections provided to findDataset, "
1034 "and no defaults from registry construction.")
1035 collections = self.defaults.collections
1036 else:
1037 collections = CollectionQuery.fromExpression(collections)
1038 TimespanReprClass = self._db.getTimespanRepresentation()
1039 if isinstance(datasetType, str):
1040 storage = self._managers.datasets[datasetType]
1041 else:
1042 storage = self._managers.datasets[datasetType.name]
1043 for collectionRecord in collections.iter(self._managers.collections,
1044 collectionTypes=frozenset(collectionTypes),
1045 flattenChains=flattenChains):
1046 query = storage.select(collectionRecord)
1047 for row in self._db.query(query.combine()).mappings():
1048 dataId = DataCoordinate.fromRequiredValues(
1049 storage.datasetType.dimensions,
1050 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1051 )
1052 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1053 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1054 conform=False)
1055 if collectionRecord.type is CollectionType.CALIBRATION:
1056 timespan = TimespanReprClass.extract(row)
1057 else:
1058 timespan = None
1059 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1061 storageClasses: StorageClassFactory
1062 """All storage classes known to the registry (`StorageClassFactory`).
1063 """