Coverage for python/lsst/daf/butler/registries/sql.py: 13%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "SqlRegistry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Tuple,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
46from lsst.utils.iteration import ensure_iterable
48from ..core import (
49 ButlerURI,
50 Config,
51 DataCoordinate,
52 DataCoordinateIterable,
53 DataId,
54 DatasetAssociation,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 ddl,
59 Dimension,
60 DimensionConfig,
61 DimensionElement,
62 DimensionGraph,
63 DimensionRecord,
64 DimensionUniverse,
65 NamedKeyMapping,
66 NameLookupMapping,
67 Progress,
68 StorageClassFactory,
69 Timespan,
70)
71from ..core.utils import transactional
73from ..registry import (
74 Registry,
75 RegistryConfig,
76 CollectionType,
77 RegistryDefaults,
78 ConflictingDefinitionError,
79 InconsistentDataIdError,
80 OrphanedRecordError,
81 CollectionSearch,
82)
83from ..registry import queries
85from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
86from ..registry.summaries import CollectionSummary
87from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
88from ..registry.queries import Query
89from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
91if TYPE_CHECKING: 91 ↛ 92line 91 didn't jump to line 92, because the condition on line 91 was never true
92 from .._butlerConfig import ButlerConfig
93 from ..registry.interfaces import (
94 CollectionRecord,
95 Database,
96 DatastoreRegistryBridgeManager,
97 )
100_LOG = logging.getLogger(__name__)
103class SqlRegistry(Registry):
104 """Registry implementation based on SQLAlchemy.
106 Parameters
107 ----------
108 database : `Database`
109 Database instance to store Registry.
110 defaults : `RegistryDefaults`
111 Default collection search path and/or output `~CollectionType.RUN`
112 collection.
113 managers : `RegistryManagerInstances`
114 All the managers required for this registry.
115 """
117 defaultConfigFile: Optional[str] = None
118 """Path to configuration defaults. Accessed within the ``configs`` resource
119 or relative to a search path. Can be None if no defaults specified.
120 """
122 @classmethod
123 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
124 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
125 butlerRoot: Optional[str] = None) -> Registry:
126 """Create registry database and return `SqlRegistry` instance.
128 This method initializes database contents, database must be empty
129 prior to calling this method.
131 Parameters
132 ----------
133 config : `RegistryConfig` or `str`, optional
134 Registry configuration, if missing then default configuration will
135 be loaded from registry.yaml.
136 dimensionConfig : `DimensionConfig` or `str`, optional
137 Dimensions configuration, if missing then default configuration
138 will be loaded from dimensions.yaml.
139 butlerRoot : `str`, optional
140 Path to the repository root this `SqlRegistry` will manage.
142 Returns
143 -------
144 registry : `SqlRegistry`
145 A new `SqlRegistry` instance.
146 """
147 config = cls.forceRegistryConfig(config)
148 config.replaceRoot(butlerRoot)
150 if isinstance(dimensionConfig, str):
151 dimensionConfig = DimensionConfig(config)
152 elif dimensionConfig is None:
153 dimensionConfig = DimensionConfig()
154 elif not isinstance(dimensionConfig, DimensionConfig):
155 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
157 DatabaseClass = config.getDatabaseClass()
158 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
159 namespace=config.get("namespace"))
160 managerTypes = RegistryManagerTypes.fromConfig(config)
161 managers = managerTypes.makeRepo(database, dimensionConfig)
162 return cls(database, RegistryDefaults(), managers)
164 @classmethod
165 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
166 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
167 defaults: Optional[RegistryDefaults] = None) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Registry database must be initialized prior to calling this method.
172 Parameters
173 ----------
174 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
175 Registry configuration
176 butlerRoot : `str` or `ButlerURI`, optional
177 Path to the repository root this `Registry` will manage.
178 writeable : `bool`, optional
179 If `True` (default) create a read-write connection to the database.
180 defaults : `RegistryDefaults`, optional
181 Default collection search path and/or output `~CollectionType.RUN`
182 collection.
184 Returns
185 -------
186 registry : `SqlRegistry` (subclass)
187 A new `SqlRegistry` subclass instance.
188 """
189 config = cls.forceRegistryConfig(config)
190 config.replaceRoot(butlerRoot)
191 DatabaseClass = config.getDatabaseClass()
192 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
193 namespace=config.get("namespace"), writeable=writeable)
194 managerTypes = RegistryManagerTypes.fromConfig(config)
195 managers = managerTypes.loadRepo(database)
196 if defaults is None:
197 defaults = RegistryDefaults()
198 return cls(database, defaults, managers)
200 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
201 self._db = database
202 self._managers = managers
203 self.storageClasses = StorageClassFactory()
204 # Intentionally invoke property setter to initialize defaults. This
205 # can only be done after most of the rest of Registry has already been
206 # initialized, and must be done before the property getter is used.
207 self.defaults = defaults
209 def __str__(self) -> str:
210 return str(self._db)
212 def __repr__(self) -> str:
213 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
215 def isWriteable(self) -> bool:
216 # Docstring inherited from lsst.daf.butler.registry.Registry
217 return self._db.isWriteable()
219 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
220 # Docstring inherited from lsst.daf.butler.registry.Registry
221 if defaults is None:
222 # No need to copy, because `RegistryDefaults` is immutable; we
223 # effectively copy on write.
224 defaults = self.defaults
225 return type(self)(self._db, defaults, self._managers)
227 @property
228 def dimensions(self) -> DimensionUniverse:
229 # Docstring inherited from lsst.daf.butler.registry.Registry
230 return self._managers.dimensions.universe
232 def refresh(self) -> None:
233 # Docstring inherited from lsst.daf.butler.registry.Registry
234 self._managers.refresh()
236 @contextlib.contextmanager
237 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
238 # Docstring inherited from lsst.daf.butler.registry.Registry
239 try:
240 with self._db.transaction(savepoint=savepoint):
241 yield
242 except BaseException:
243 # TODO: this clears the caches sometimes when we wouldn't actually
244 # need to. Can we avoid that?
245 self._managers.dimensions.clearCaches()
246 raise
248 def resetConnectionPool(self) -> None:
249 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
251 This operation is useful when using registry with fork-based
252 multiprocessing. To use registry across fork boundary one has to make
253 sure that there are no currently active connections (no session or
254 transaction is in progress) and connection pool is reset using this
255 method. This method should be called by the child process immediately
256 after the fork.
257 """
258 self._db._engine.dispose()
260 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
261 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
262 other data repository client.
264 Opaque table records can be added via `insertOpaqueData`, retrieved via
265 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
267 Parameters
268 ----------
269 tableName : `str`
270 Logical name of the opaque table. This may differ from the
271 actual name used in the database by a prefix and/or suffix.
272 spec : `ddl.TableSpec`
273 Specification for the table to be added.
274 """
275 self._managers.opaque.register(tableName, spec)
277 @transactional
278 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
279 """Insert records into an opaque table.
281 Parameters
282 ----------
283 tableName : `str`
284 Logical name of the opaque table. Must match the name used in a
285 previous call to `registerOpaqueTable`.
286 data
287 Each additional positional argument is a dictionary that represents
288 a single row to be added.
289 """
290 self._managers.opaque[tableName].insert(*data)
292 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
293 """Retrieve records from an opaque table.
295 Parameters
296 ----------
297 tableName : `str`
298 Logical name of the opaque table. Must match the name used in a
299 previous call to `registerOpaqueTable`.
300 where
301 Additional keyword arguments are interpreted as equality
302 constraints that restrict the returned rows (combined with AND);
303 keyword arguments are column names and values are the values they
304 must have.
306 Yields
307 ------
308 row : `dict`
309 A dictionary representing a single result row.
310 """
311 yield from self._managers.opaque[tableName].fetch(**where)
313 @transactional
314 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
315 """Remove records from an opaque table.
317 Parameters
318 ----------
319 tableName : `str`
320 Logical name of the opaque table. Must match the name used in a
321 previous call to `registerOpaqueTable`.
322 where
323 Additional keyword arguments are interpreted as equality
324 constraints that restrict the deleted rows (combined with AND);
325 keyword arguments are column names and values are the values they
326 must have.
327 """
328 self._managers.opaque[tableName].delete(where.keys(), where)
330 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
331 doc: Optional[str] = None) -> bool:
332 # Docstring inherited from lsst.daf.butler.registry.Registry
333 _, registered = self._managers.collections.register(name, type, doc=doc)
334 return registered
336 def getCollectionType(self, name: str) -> CollectionType:
337 # Docstring inherited from lsst.daf.butler.registry.Registry
338 return self._managers.collections.find(name).type
340 def _get_collection_record(self, name: str) -> CollectionRecord:
341 # Docstring inherited from lsst.daf.butler.registry.Registry
342 return self._managers.collections.find(name)
344 def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
345 # Docstring inherited from lsst.daf.butler.registry.Registry
346 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
347 return registered
349 @transactional
350 def removeCollection(self, name: str) -> None:
351 # Docstring inherited from lsst.daf.butler.registry.Registry
352 self._managers.collections.remove(name)
354 def getCollectionChain(self, parent: str) -> CollectionSearch:
355 # Docstring inherited from lsst.daf.butler.registry.Registry
356 record = self._managers.collections.find(parent)
357 if record.type is not CollectionType.CHAINED:
358 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
359 assert isinstance(record, ChainedCollectionRecord)
360 return record.children
362 @transactional
363 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
364 # Docstring inherited from lsst.daf.butler.registry.Registry
365 record = self._managers.collections.find(parent)
366 if record.type is not CollectionType.CHAINED:
367 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
368 assert isinstance(record, ChainedCollectionRecord)
369 children = CollectionSearch.fromExpression(children)
370 if children != record.children or flatten:
371 record.update(self._managers.collections, children, flatten=flatten)
373 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
374 # Docstring inherited from lsst.daf.butler.registry.Registry
375 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
377 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
378 # Docstring inherited from lsst.daf.butler.registry.Registry
379 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
381 def getCollectionSummary(self, collection: str) -> CollectionSummary:
382 # Docstring inherited from lsst.daf.butler.registry.Registry
383 record = self._managers.collections.find(collection)
384 return self._managers.datasets.getCollectionSummary(record)
386 def registerDatasetType(self, datasetType: DatasetType) -> bool:
387 # Docstring inherited from lsst.daf.butler.registry.Registry
388 _, inserted = self._managers.datasets.register(datasetType)
389 return inserted
391 def removeDatasetType(self, name: str) -> None:
392 # Docstring inherited from lsst.daf.butler.registry.Registry
393 self._managers.datasets.remove(name)
395 def getDatasetType(self, name: str) -> DatasetType:
396 # Docstring inherited from lsst.daf.butler.registry.Registry
397 return self._managers.datasets[name].datasetType
399 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
400 # Docstring inherited from lsst.daf.butler.registry.Registry
401 return self._managers.datasets.supportsIdGenerationMode(mode)
403 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
404 collections: Any = None, timespan: Optional[Timespan] = None,
405 **kwargs: Any) -> Optional[DatasetRef]:
406 # Docstring inherited from lsst.daf.butler.registry.Registry
407 if isinstance(datasetType, DatasetType):
408 storage = self._managers.datasets[datasetType.name]
409 else:
410 storage = self._managers.datasets[datasetType]
411 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
412 universe=self.dimensions, defaults=self.defaults.dataId,
413 **kwargs)
414 if collections is None:
415 if not self.defaults.collections:
416 raise TypeError("No collections provided to findDataset, "
417 "and no defaults from registry construction.")
418 collections = self.defaults.collections
419 else:
420 collections = CollectionSearch.fromExpression(collections)
421 for collectionRecord in collections.iter(self._managers.collections):
422 if (collectionRecord.type is CollectionType.CALIBRATION
423 and (not storage.datasetType.isCalibration() or timespan is None)):
424 continue
425 result = storage.find(collectionRecord, dataId, timespan=timespan)
426 if result is not None:
427 return result
429 return None
431 @transactional
432 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
433 run: Optional[str] = None, expand: bool = True,
434 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
435 # Docstring inherited from lsst.daf.butler.registry.Registry
436 if isinstance(datasetType, DatasetType):
437 storage = self._managers.datasets.find(datasetType.name)
438 if storage is None:
439 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
440 else:
441 storage = self._managers.datasets.find(datasetType)
442 if storage is None:
443 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
444 if run is None:
445 if self.defaults.run is None:
446 raise TypeError("No run provided to insertDatasets, "
447 "and no default from registry construction.")
448 run = self.defaults.run
449 runRecord = self._managers.collections.find(run)
450 if runRecord.type is not CollectionType.RUN:
451 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
452 assert isinstance(runRecord, RunRecord)
453 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
454 if expand:
455 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
456 for dataId in progress.wrap(dataIds,
457 f"Expanding {storage.datasetType.name} data IDs")]
458 else:
459 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
460 for dataId in dataIds]
461 try:
462 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
463 except sqlalchemy.exc.IntegrityError as err:
464 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
465 f"one or more datasets of type {storage.datasetType} into "
466 f"collection '{run}'. "
467 f"This probably means a dataset with the same data ID "
468 f"and dataset type already exists, but it may also mean a "
469 f"dimension row is missing.") from err
470 return refs
472 @transactional
473 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
474 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
475 reuseIds: bool = False) -> List[DatasetRef]:
476 # Docstring inherited from lsst.daf.butler.registry.Registry
477 datasets = list(datasets)
478 if not datasets:
479 # nothing to do
480 return []
482 # find dataset type
483 datasetTypes = set(dataset.datasetType for dataset in datasets)
484 if len(datasetTypes) != 1:
485 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
486 datasetType = datasetTypes.pop()
488 # get storage handler for this dataset type
489 storage = self._managers.datasets.find(datasetType.name)
490 if storage is None:
491 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
493 # find run name
494 runs = set(dataset.run for dataset in datasets)
495 if len(runs) != 1:
496 raise ValueError(f"Multiple run names in input datasets: {runs}")
497 run = runs.pop()
498 if run is None:
499 if self.defaults.run is None:
500 raise TypeError("No run provided to ingestDatasets, "
501 "and no default from registry construction.")
502 run = self.defaults.run
504 runRecord = self._managers.collections.find(run)
505 if runRecord.type is not CollectionType.RUN:
506 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
507 " RUN collection required.")
508 assert isinstance(runRecord, RunRecord)
510 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
511 if expand:
512 expandedDatasets = [
513 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
514 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
515 else:
516 expandedDatasets = [
517 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
518 for dataset in datasets
519 ]
521 try:
522 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
523 except sqlalchemy.exc.IntegrityError as err:
524 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
525 f"one or more datasets of type {storage.datasetType} into "
526 f"collection '{run}'. "
527 f"This probably means a dataset with the same data ID "
528 f"and dataset type already exists, but it may also mean a "
529 f"dimension row is missing.") from err
530 return refs
532 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
533 # Docstring inherited from lsst.daf.butler.registry.Registry
534 return self._managers.datasets.getDatasetRef(id)
536 @transactional
537 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
538 # Docstring inherited from lsst.daf.butler.registry.Registry
539 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
540 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
541 desc="Removing datasets by type"):
542 storage = self._managers.datasets[datasetType.name]
543 try:
544 storage.delete(refsForType)
545 except sqlalchemy.exc.IntegrityError as err:
546 raise OrphanedRecordError("One or more datasets is still "
547 "present in one or more Datastores.") from err
549 @transactional
550 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
551 # Docstring inherited from lsst.daf.butler.registry.Registry
552 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
553 collectionRecord = self._managers.collections.find(collection)
554 if collectionRecord.type is not CollectionType.TAGGED:
555 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
556 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
557 desc="Associating datasets by type"):
558 storage = self._managers.datasets[datasetType.name]
559 try:
560 storage.associate(collectionRecord, refsForType)
561 except sqlalchemy.exc.IntegrityError as err:
562 raise ConflictingDefinitionError(
563 f"Constraint violation while associating dataset of type {datasetType.name} with "
564 f"collection {collection}. This probably means that one or more datasets with the same "
565 f"dataset type and data ID already exist in the collection, but it may also indicate "
566 f"that the datasets do not exist."
567 ) from err
569 @transactional
570 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
571 # Docstring inherited from lsst.daf.butler.registry.Registry
572 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
573 collectionRecord = self._managers.collections.find(collection)
574 if collectionRecord.type is not CollectionType.TAGGED:
575 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
576 "expected TAGGED.")
577 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
578 desc="Disassociating datasets by type"):
579 storage = self._managers.datasets[datasetType.name]
580 storage.disassociate(collectionRecord, refsForType)
582 @transactional
583 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
584 # Docstring inherited from lsst.daf.butler.registry.Registry
585 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
586 collectionRecord = self._managers.collections.find(collection)
587 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
588 desc="Certifying datasets by type"):
589 storage = self._managers.datasets[datasetType.name]
590 storage.certify(collectionRecord, refsForType, timespan)
592 @transactional
593 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
594 dataIds: Optional[Iterable[DataId]] = None) -> None:
595 # Docstring inherited from lsst.daf.butler.registry.Registry
596 collectionRecord = self._managers.collections.find(collection)
597 if isinstance(datasetType, str):
598 storage = self._managers.datasets[datasetType]
599 else:
600 storage = self._managers.datasets[datasetType.name]
601 standardizedDataIds = None
602 if dataIds is not None:
603 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
604 for d in dataIds]
605 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
607 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
608 """Return an object that allows a new `Datastore` instance to
609 communicate with this `Registry`.
611 Returns
612 -------
613 manager : `DatastoreRegistryBridgeManager`
614 Object that mediates communication between this `Registry` and its
615 associated datastores.
616 """
617 return self._managers.datastores
619 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
620 # Docstring inherited from lsst.daf.butler.registry.Registry
621 return self._managers.datastores.findDatastores(ref)
623 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
624 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
625 withDefaults: bool = True,
626 **kwargs: Any) -> DataCoordinate:
627 # Docstring inherited from lsst.daf.butler.registry.Registry
628 if not withDefaults:
629 defaults = None
630 else:
631 defaults = self.defaults.dataId
632 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
633 defaults=defaults, **kwargs)
634 if standardized.hasRecords():
635 return standardized
636 if records is None:
637 records = {}
638 elif isinstance(records, NamedKeyMapping):
639 records = records.byName()
640 else:
641 records = dict(records)
642 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
643 records.update(dataId.records.byName())
644 keys = standardized.byName()
645 for element in standardized.graph.primaryKeyTraversalOrder:
646 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
647 if record is ...:
648 if isinstance(element, Dimension) and keys.get(element.name) is None:
649 if element in standardized.graph.required:
650 raise LookupError(
651 f"No value or null value for required dimension {element.name}."
652 )
653 keys[element.name] = None
654 record = None
655 else:
656 storage = self._managers.dimensions[element]
657 dataIdSet = DataCoordinateIterable.fromScalar(
658 DataCoordinate.standardize(keys, graph=element.graph)
659 )
660 fetched = tuple(storage.fetch(dataIdSet))
661 try:
662 (record,) = fetched
663 except ValueError:
664 record = None
665 records[element.name] = record
666 if record is not None:
667 for d in element.implied:
668 value = getattr(record, d.name)
669 if keys.setdefault(d.name, value) != value:
670 raise InconsistentDataIdError(
671 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
672 f"but {element.name} implies {d.name}={value!r}."
673 )
674 else:
675 if element in standardized.graph.required:
676 raise LookupError(
677 f"Could not fetch record for required dimension {element.name} via keys {keys}."
678 )
679 if element.alwaysJoin:
680 raise InconsistentDataIdError(
681 f"Could not fetch record for element {element.name} via keys {keys}, ",
682 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
683 "related."
684 )
685 for d in element.implied:
686 keys.setdefault(d.name, None)
687 records.setdefault(d.name, None)
688 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
690 def insertDimensionData(self, element: Union[DimensionElement, str],
691 *data: Union[Mapping[str, Any], DimensionRecord],
692 conform: bool = True,
693 replace: bool = False) -> None:
694 # Docstring inherited from lsst.daf.butler.registry.Registry
695 if conform:
696 if isinstance(element, str):
697 element = self.dimensions[element]
698 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
699 for row in data]
700 else:
701 # Ignore typing since caller said to trust them with conform=False.
702 records = data # type: ignore
703 storage = self._managers.dimensions[element] # type: ignore
704 storage.insert(*records, replace=replace)
706 def syncDimensionData(self, element: Union[DimensionElement, str],
707 row: Union[Mapping[str, Any], DimensionRecord],
708 conform: bool = True,
709 update: bool = False) -> Union[bool, Dict[str, Any]]:
710 # Docstring inherited from lsst.daf.butler.registry.Registry
711 if conform:
712 if isinstance(element, str):
713 element = self.dimensions[element]
714 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
715 else:
716 # Ignore typing since caller said to trust them with conform=False.
717 record = row # type: ignore
718 storage = self._managers.dimensions[element] # type: ignore
719 return storage.sync(record, update=update)
721 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None,
722 missing: Optional[List[str]] = None,
723 ) -> Iterator[DatasetType]:
724 # Docstring inherited from lsst.daf.butler.registry.Registry
725 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
726 if wildcard is Ellipsis:
727 for datasetType in self._managers.datasets:
728 # The dataset type can no longer be a component
729 yield datasetType
730 if components:
731 # Automatically create the component dataset types
732 try:
733 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
734 except KeyError as err:
735 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
736 "if it has components they will not be included in query results.")
737 else:
738 yield from componentsForDatasetType
739 return
740 done: Set[str] = set()
741 for name in wildcard.strings:
742 storage = self._managers.datasets.find(name)
743 done.add(name)
744 if storage is None:
745 if missing is not None:
746 missing.append(name)
747 else:
748 yield storage.datasetType
749 if wildcard.patterns:
750 # If components (the argument) is None, we'll save component
751 # dataset that we might want to match, but only if their parents
752 # didn't get included.
753 componentsForLater = []
754 for registeredDatasetType in self._managers.datasets:
755 # Components are not stored in registry so expand them here
756 allDatasetTypes = [registeredDatasetType]
757 try:
758 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
759 except KeyError as err:
760 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
761 "if it has components they will not be included in query results.")
762 for datasetType in allDatasetTypes:
763 if datasetType.name in done:
764 continue
765 parentName, componentName = datasetType.nameAndComponent()
766 if componentName is not None and not components:
767 if components is None and parentName not in done:
768 componentsForLater.append(datasetType)
769 continue
770 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
771 done.add(datasetType.name)
772 yield datasetType
773 # Go back and try to match saved components.
774 for datasetType in componentsForLater:
775 parentName, _ = datasetType.nameAndComponent()
776 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
777 yield datasetType
779 def queryCollections(
780 self,
781 expression: Any = ...,
782 datasetType: Optional[DatasetType] = None,
783 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
784 flattenChains: bool = False,
785 includeChains: Optional[bool] = None,
786 ) -> Iterator[str]:
787 # Docstring inherited from lsst.daf.butler.registry.Registry
789 # Right now the datasetTypes argument is completely ignored, but that
790 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
791 # ticket will take care of that.
792 query = CollectionQuery.fromExpression(expression)
793 collectionTypes = ensure_iterable(collectionTypes)
794 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
795 flattenChains=flattenChains, includeChains=includeChains):
796 yield record.name
798 def _makeQueryBuilder(self, summary: queries.QuerySummary,
799 doomed_by: Iterable[str] = ()) -> queries.QueryBuilder:
800 """Return a `QueryBuilder` instance capable of constructing and
801 managing more complex queries than those obtainable via `Registry`
802 interfaces.
804 This is an advanced interface; downstream code should prefer
805 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
806 are sufficient.
808 Parameters
809 ----------
810 summary : `queries.QuerySummary`
811 Object describing and categorizing the full set of dimensions that
812 will be included in the query.
813 doomed_by : `Iterable` of `str`, optional
814 A list of diagnostic messages that indicate why the query is going
815 to yield no results and should not even be executed. If an empty
816 container (default) the query will be executed unless other code
817 determines that it is doomed.
819 Returns
820 -------
821 builder : `queries.QueryBuilder`
822 Object that can be used to construct and perform advanced queries.
823 """
824 return queries.QueryBuilder(
825 summary,
826 queries.RegistryManagers(
827 collections=self._managers.collections,
828 dimensions=self._managers.dimensions,
829 datasets=self._managers.datasets,
830 TimespanReprClass=self._db.getTimespanRepresentation(),
831 ),
832 doomed_by=doomed_by,
833 )
835 def queryDatasets(self, datasetType: Any, *,
836 collections: Any = None,
837 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
838 dataId: Optional[DataId] = None,
839 where: Optional[str] = None,
840 findFirst: bool = False,
841 components: Optional[bool] = None,
842 bind: Optional[Mapping[str, Any]] = None,
843 check: bool = True,
844 **kwargs: Any) -> queries.DatasetQueryResults:
845 # Docstring inherited from lsst.daf.butler.registry.Registry
847 # Standardize the collections expression.
848 if collections is None:
849 if not self.defaults.collections:
850 raise TypeError("No collections provided to findDataset, "
851 "and no defaults from registry construction.")
852 collections = self.defaults.collections
853 elif findFirst:
854 collections = CollectionSearch.fromExpression(collections)
855 else:
856 collections = CollectionQuery.fromExpression(collections)
857 # Standardize and expand the data ID provided as a constraint.
858 standardizedDataId = self.expandDataId(dataId, **kwargs)
860 # We can only query directly if given a non-component DatasetType
861 # instance. If we were given an expression or str or a component
862 # DatasetType instance, we'll populate this dict, recurse, and return.
863 # If we already have a non-component DatasetType, it will remain None
864 # and we'll run the query directly.
865 composition: Optional[
866 Dict[
867 DatasetType, # parent dataset type
868 List[Optional[str]] # component name, or None for parent
869 ]
870 ] = None
871 if not isinstance(datasetType, DatasetType):
872 # We were given a dataset type expression (which may be as simple
873 # as a str). Loop over all matching datasets, delegating handling
874 # of the `components` argument to queryDatasetTypes, as we populate
875 # the composition dict.
876 composition = defaultdict(list)
877 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
878 parentName, componentName = trueDatasetType.nameAndComponent()
879 if componentName is not None:
880 parentDatasetType = self.getDatasetType(parentName)
881 composition.setdefault(parentDatasetType, []).append(componentName)
882 else:
883 composition.setdefault(trueDatasetType, []).append(None)
884 if not composition:
885 return queries.ChainedDatasetQueryResults(
886 [],
887 doomed_by=[f"No registered dataset type matching {t!r} found."
888 for t in ensure_iterable(datasetType)],
889 )
890 elif datasetType.isComponent():
891 # We were given a true DatasetType instance, but it's a component.
892 # the composition dict will have exactly one item.
893 parentName, componentName = datasetType.nameAndComponent()
894 parentDatasetType = self.getDatasetType(parentName)
895 composition = {parentDatasetType: [componentName]}
896 if composition is not None:
897 # We need to recurse. Do that once for each parent dataset type.
898 chain = []
899 for parentDatasetType, componentNames in composition.items():
900 parentResults = self.queryDatasets(
901 parentDatasetType,
902 collections=collections,
903 dimensions=dimensions,
904 dataId=standardizedDataId,
905 where=where,
906 bind=bind,
907 findFirst=findFirst,
908 check=check,
909 )
910 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \
911 "Should always be true if passing in a DatasetType instance, and we are."
912 chain.append(
913 parentResults.withComponents(componentNames)
914 )
915 return queries.ChainedDatasetQueryResults(chain)
916 # If we get here, there's no need to recurse (or we are already
917 # recursing; there can only ever be one level of recursion).
919 # The full set of dimensions in the query is the combination of those
920 # needed for the DatasetType and those explicitly requested, if any.
921 requestedDimensionNames = set(datasetType.dimensions.names)
922 if dimensions is not None:
923 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
924 # Construct the summary structure needed to construct a QueryBuilder.
925 summary = queries.QuerySummary(
926 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
927 dataId=standardizedDataId,
928 expression=where,
929 bind=bind,
930 defaults=self.defaults.dataId,
931 check=check,
932 datasets=[datasetType],
933 )
934 builder = self._makeQueryBuilder(summary)
935 # Add the dataset subquery to the query, telling the QueryBuilder to
936 # include the rank of the selected collection in the results only if we
937 # need to findFirst. Note that if any of the collections are
938 # actually wildcard expressions, and we've asked for deduplication,
939 # this will raise TypeError for us.
940 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
941 query = builder.finish()
942 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)
944 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
945 dataId: Optional[DataId] = None,
946 datasets: Any = None,
947 collections: Any = None,
948 where: Optional[str] = None,
949 components: Optional[bool] = None,
950 bind: Optional[Mapping[str, Any]] = None,
951 check: bool = True,
952 **kwargs: Any) -> queries.DataCoordinateQueryResults:
953 # Docstring inherited from lsst.daf.butler.registry.Registry
954 dimensions = ensure_iterable(dimensions)
955 standardizedDataId = self.expandDataId(dataId, **kwargs)
956 standardizedDatasetTypes = set()
957 requestedDimensions = self.dimensions.extract(dimensions)
958 missing: List[str] = []
959 if datasets is not None:
960 if not collections:
961 if not self.defaults.collections:
962 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
963 collections = self.defaults.collections
964 else:
965 # Preprocess collections expression in case the original
966 # included single-pass iterators (we'll want to use it multiple
967 # times below).
968 collections = CollectionQuery.fromExpression(collections)
969 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
970 # If any matched dataset type is a component, just operate on
971 # its parent instead, because Registry doesn't know anything
972 # about what components exist, and here (unlike queryDatasets)
973 # we don't care about returning them.
974 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
975 if componentName is not None:
976 datasetType = self.getDatasetType(parentDatasetTypeName)
977 standardizedDatasetTypes.add(datasetType)
978 elif collections:
979 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
981 def query_factory(order_by: Optional[Iterable[str]] = None,
982 limit: Optional[Tuple[int, Optional[int]]] = None) -> Query:
983 """Construct the Query object that generates query results.
984 """
985 summary = queries.QuerySummary(
986 requested=requestedDimensions,
987 dataId=standardizedDataId,
988 expression=where,
989 bind=bind,
990 defaults=self.defaults.dataId,
991 check=check,
992 datasets=standardizedDatasetTypes,
993 order_by=order_by,
994 limit=limit
995 )
996 builder = self._makeQueryBuilder(
997 summary,
998 doomed_by=[f"Dataset type {name} is not registered." for name in missing]
999 )
1000 for datasetType in standardizedDatasetTypes:
1001 builder.joinDataset(datasetType, collections, isResult=False,)
1002 return builder.finish()
1004 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)
1006 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1007 dataId: Optional[DataId] = None,
1008 datasets: Any = None,
1009 collections: Any = None,
1010 where: Optional[str] = None,
1011 components: Optional[bool] = None,
1012 bind: Optional[Mapping[str, Any]] = None,
1013 check: bool = True,
1014 **kwargs: Any) -> queries.DimensionRecordQueryResults:
1015 # Docstring inherited from lsst.daf.butler.registry.Registry
1016 if not isinstance(element, DimensionElement):
1017 try:
1018 element = self.dimensions[element]
1019 except KeyError as e:
1020 raise KeyError(f"No such dimension '{element}', available dimensions: "
1021 + str(self.dimensions.getStaticElements())) from e
1022 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1023 where=where, components=components, bind=bind, check=check, **kwargs)
1024 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])
1026 def queryDatasetAssociations(
1027 self,
1028 datasetType: Union[str, DatasetType],
1029 collections: Any = ...,
1030 *,
1031 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1032 flattenChains: bool = False,
1033 ) -> Iterator[DatasetAssociation]:
1034 # Docstring inherited from lsst.daf.butler.registry.Registry
1035 if collections is None:
1036 if not self.defaults.collections:
1037 raise TypeError("No collections provided to findDataset, "
1038 "and no defaults from registry construction.")
1039 collections = self.defaults.collections
1040 else:
1041 collections = CollectionQuery.fromExpression(collections)
1042 TimespanReprClass = self._db.getTimespanRepresentation()
1043 if isinstance(datasetType, str):
1044 storage = self._managers.datasets[datasetType]
1045 else:
1046 storage = self._managers.datasets[datasetType.name]
1047 for collectionRecord in collections.iter(self._managers.collections,
1048 collectionTypes=frozenset(collectionTypes),
1049 flattenChains=flattenChains):
1050 query = storage.select(collectionRecord)
1051 for row in self._db.query(query.combine()).mappings():
1052 dataId = DataCoordinate.fromRequiredValues(
1053 storage.datasetType.dimensions,
1054 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1055 )
1056 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1057 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1058 conform=False)
1059 if collectionRecord.type is CollectionType.CALIBRATION:
1060 timespan = TimespanReprClass.extract(row)
1061 else:
1062 timespan = None
1063 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1065 storageClasses: StorageClassFactory
1066 """All storage classes known to the registry (`StorageClassFactory`).
1067 """