Coverage for python/lsst/daf/butler/registries/sql.py : 13%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "SqlRegistry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 TYPE_CHECKING,
41 Union,
42)
44import sqlalchemy
46from ..core import (
47 ButlerURI,
48 Config,
49 DataCoordinate,
50 DataCoordinateIterable,
51 DataId,
52 DatasetAssociation,
53 DatasetId,
54 DatasetRef,
55 DatasetType,
56 ddl,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 NamedKeyMapping,
64 NameLookupMapping,
65 Progress,
66 StorageClassFactory,
67 Timespan,
68)
69from ..core.utils import iterable, transactional
71from ..registry import (
72 Registry,
73 RegistryConfig,
74 CollectionType,
75 RegistryDefaults,
76 ConflictingDefinitionError,
77 InconsistentDataIdError,
78 OrphanedRecordError,
79 CollectionSearch,
80)
81from ..registry import queries
82from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
83from ..registry.summaries import CollectionSummary
84from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
85from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
87if TYPE_CHECKING:
88 from .._butlerConfig import ButlerConfig
89 from ..registry.interfaces import (
90 CollectionRecord,
91 Database,
92 DatastoreRegistryBridgeManager,
93 )
96_LOG = logging.getLogger(__name__)
99class SqlRegistry(Registry):
100 """Registry implementation based on SQLAlchemy.
102 Parameters
103 ----------
104 database : `Database`
105 Database instance in which to store the Registry.
106 defaults : `RegistryDefaults`
107 Default collection search path and/or output `~CollectionType.RUN`
108 collection.
109 managers : `RegistryManagerInstances`
110 All the managers required for this registry.
111 """
113 defaultConfigFile: Optional[str] = None
114 """Path to configuration defaults. Accessed within the ``configs`` resource
115 or relative to a search path. Can be `None` if no defaults are specified.
116 """
118 @classmethod
119 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
120 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
121 butlerRoot: Optional[str] = None) -> Registry:
122 """Create registry database and return `SqlRegistry` instance.
124 This method initializes the database contents; the database must be
125 empty prior to calling this method.
127 Parameters
128 ----------
129 config : `RegistryConfig` or `str`, optional
130 Registry configuration; if missing, the default configuration will
131 be loaded from registry.yaml.
132 dimensionConfig : `DimensionConfig` or `str`, optional
133 Dimensions configuration; if missing, the default configuration
134 will be loaded from dimensions.yaml.
135 butlerRoot : `str`, optional
136 Path to the repository root this `SqlRegistry` will manage.
138 Returns
139 -------
140 registry : `SqlRegistry`
141 A new `SqlRegistry` instance.
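Examples
--------
A minimal sketch of creating a brand-new registry; the configuration file
name and repository root used here are hypothetical, and the call assumes
the configured database is empty::

    from lsst.daf.butler.registry import RegistryConfig
    from lsst.daf.butler.registries.sql import SqlRegistry

    # Load a registry configuration (hypothetical file name).
    config = RegistryConfig("registry.yaml")
    # Initialize the (empty) database and get a registry for it.
    registry = SqlRegistry.createFromConfig(config, butlerRoot="/repo/root")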
142 """
143 config = cls.forceRegistryConfig(config)
144 config.replaceRoot(butlerRoot)
146 if isinstance(dimensionConfig, str):
147 dimensionConfig = DimensionConfig(dimensionConfig)
148 elif dimensionConfig is None:
149 dimensionConfig = DimensionConfig()
150 elif not isinstance(dimensionConfig, DimensionConfig):
151 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
153 DatabaseClass = config.getDatabaseClass()
154 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
155 namespace=config.get("namespace"))
156 managerTypes = RegistryManagerTypes.fromConfig(config)
157 managers = managerTypes.makeRepo(database, dimensionConfig)
158 return cls(database, RegistryDefaults(), managers)
160 @classmethod
161 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
162 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
163 defaults: Optional[RegistryDefaults] = None) -> Registry:
164 """Create `Registry` subclass instance from `config`.
166 The registry database must be initialized prior to calling this method.
168 Parameters
169 ----------
170 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
171 Registry configuration.
172 butlerRoot : `str` or `ButlerURI`, optional
173 Path to the repository root this `Registry` will manage.
174 writeable : `bool`, optional
175 If `True` (default) create a read-write connection to the database.
176 defaults : `RegistryDefaults`, optional
177 Default collection search path and/or output `~CollectionType.RUN`
178 collection.
180 Returns
181 -------
182 registry : `SqlRegistry` (subclass)
183 A new `SqlRegistry` subclass instance.
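Examples
--------
A minimal sketch of connecting to an already-initialized repository
read-only; the configuration path is hypothetical::

    from lsst.daf.butler import ButlerConfig
    from lsst.daf.butler.registries.sql import SqlRegistry

    # Point at an existing repository (hypothetical path) and open it
    # without write access.
    butler_config = ButlerConfig("/repo/root/butler.yaml")
    registry = SqlRegistry.fromConfig(butler_config, writeable=False)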
184 """
185 config = cls.forceRegistryConfig(config)
186 config.replaceRoot(butlerRoot)
187 DatabaseClass = config.getDatabaseClass()
188 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
189 namespace=config.get("namespace"), writeable=writeable)
190 managerTypes = RegistryManagerTypes.fromConfig(config)
191 managers = managerTypes.loadRepo(database)
192 if defaults is None:
193 defaults = RegistryDefaults()
194 return cls(database, defaults, managers)
196 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
197 self._db = database
198 self._managers = managers
199 self.storageClasses = StorageClassFactory()
200 # Intentionally invoke property setter to initialize defaults. This
201 # can only be done after most of the rest of Registry has already been
202 # initialized, and must be done before the property getter is used.
203 self.defaults = defaults
205 def __str__(self) -> str:
206 return str(self._db)
208 def __repr__(self) -> str:
209 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
211 def isWriteable(self) -> bool:
212 # Docstring inherited from lsst.daf.butler.registry.Registry
213 return self._db.isWriteable()
215 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
216 # Docstring inherited from lsst.daf.butler.registry.Registry
217 if defaults is None:
218 # No need to copy, because `RegistryDefaults` is immutable; we
219 # effectively copy on write.
220 defaults = self.defaults
221 return type(self)(self._db, defaults, self._managers)
223 @property
224 def dimensions(self) -> DimensionUniverse:
225 # Docstring inherited from lsst.daf.butler.registry.Registry
226 return self._managers.dimensions.universe
228 def refresh(self) -> None:
229 # Docstring inherited from lsst.daf.butler.registry.Registry
230 self._managers.refresh()
232 @contextlib.contextmanager
233 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
234 # Docstring inherited from lsst.daf.butler.registry.Registry
235 try:
236 with self._db.transaction(savepoint=savepoint):
237 yield
238 except BaseException:
239 # TODO: this clears the caches sometimes when we wouldn't actually
240 # need to. Can we avoid that?
241 self._managers.dimensions.clearCaches()
242 raise
244 def resetConnectionPool(self) -> None:
245 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
247 This operation is useful when using the registry with fork-based
248 multiprocessing. To use the registry across a fork boundary, one has
249 to make sure that there are no currently active connections (no
250 session or transaction in progress) and that the connection pool is
251 reset using this method. The child process should call this method
252 immediately after the fork.
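Examples
--------
A minimal sketch of the intended call pattern with ``os.fork``; the
surrounding setup is hypothetical and assumes no session or transaction
is open when the fork happens::

    import os

    pid = os.fork()
    if pid == 0:
        # Child process: discard pooled connections inherited from the
        # parent before issuing any new queries.
        registry.resetConnectionPool()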
253 """
254 self._db._engine.dispose()
256 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
257 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
258 other data repository client.
260 Opaque table records can be added via `insertOpaqueData`, retrieved via
261 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
263 Parameters
264 ----------
265 tableName : `str`
266 Logical name of the opaque table. This may differ from the
267 actual name used in the database by a prefix and/or suffix.
268 spec : `ddl.TableSpec`
269 Specification for the table to be added.
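Examples
--------
A minimal sketch of registering a small opaque table; the table name and
field definitions are hypothetical, and the ``ddl.FieldSpec`` arguments
are sketched from typical usage rather than copied from a real
datastore::

    import sqlalchemy
    from lsst.daf.butler import ddl

    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
            ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
        ]
    )
    registry.registerOpaqueTable("datastore_records", spec)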
270 """
271 self._managers.opaque.register(tableName, spec)
273 @transactional
274 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
275 """Insert records into an opaque table.
277 Parameters
278 ----------
279 tableName : `str`
280 Logical name of the opaque table. Must match the name used in a
281 previous call to `registerOpaqueTable`.
282 data
283 Each additional positional argument is a dictionary that represents
284 a single row to be added.
285 """
286 self._managers.opaque[tableName].insert(*data)
288 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
289 """Retrieve records from an opaque table.
291 Parameters
292 ----------
293 tableName : `str`
294 Logical name of the opaque table. Must match the name used in a
295 previous call to `registerOpaqueTable`.
296 where
297 Additional keyword arguments are interpreted as equality
298 constraints that restrict the returned rows (combined with AND);
299 keyword arguments are column names and values are the values they
300 must have.
302 Yields
303 ------
304 row : `dict`
305 A dictionary representing a single result row.
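Examples
--------
A minimal sketch of inserting and then fetching opaque records; the table
name and column names are hypothetical and assume a matching
`registerOpaqueTable` call was made earlier::

    registry.insertOpaqueData(
        "datastore_records",
        {"dataset_id": 1, "path": "a/b/c.fits"},
        {"dataset_id": 2, "path": "d/e/f.fits"},
    )
    for row in registry.fetchOpaqueData("datastore_records", dataset_id=1):
        print(row["path"])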
306 """
307 yield from self._managers.opaque[tableName].fetch(**where)
309 @transactional
310 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
311 """Remove records from an opaque table.
313 Parameters
314 ----------
315 tableName : `str`
316 Logical name of the opaque table. Must match the name used in a
317 previous call to `registerOpaqueTable`.
318 where
319 Additional keyword arguments are interpreted as equality
320 constraints that restrict the deleted rows (combined with AND);
321 keyword arguments are column names and values are the values they
322 must have.
323 """
324 self._managers.opaque[tableName].delete(where.keys(), where)
326 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
327 doc: Optional[str] = None) -> None:
328 # Docstring inherited from lsst.daf.butler.registry.Registry
329 self._managers.collections.register(name, type, doc=doc)
331 def getCollectionType(self, name: str) -> CollectionType:
332 # Docstring inherited from lsst.daf.butler.registry.Registry
333 return self._managers.collections.find(name).type
335 def _get_collection_record(self, name: str) -> CollectionRecord:
336 # Docstring inherited from lsst.daf.butler.registry.Registry
337 return self._managers.collections.find(name)
339 def registerRun(self, name: str, doc: Optional[str] = None) -> None:
340 # Docstring inherited from lsst.daf.butler.registry.Registry
341 self._managers.collections.register(name, CollectionType.RUN, doc=doc)
343 @transactional
344 def removeCollection(self, name: str) -> None:
345 # Docstring inherited from lsst.daf.butler.registry.Registry
346 self._managers.collections.remove(name)
348 def getCollectionChain(self, parent: str) -> CollectionSearch:
349 # Docstring inherited from lsst.daf.butler.registry.Registry
350 record = self._managers.collections.find(parent)
351 if record.type is not CollectionType.CHAINED:
352 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
353 assert isinstance(record, ChainedCollectionRecord)
354 return record.children
356 @transactional
357 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
358 # Docstring inherited from lsst.daf.butler.registry.Registry
359 record = self._managers.collections.find(parent)
360 if record.type is not CollectionType.CHAINED:
361 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
362 assert isinstance(record, ChainedCollectionRecord)
363 children = CollectionSearch.fromExpression(children)
364 if children != record.children or flatten:
365 record.update(self._managers.collections, children, flatten=flatten)
367 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
368 # Docstring inherited from lsst.daf.butler.registry.Registry
369 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
371 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
372 # Docstring inherited from lsst.daf.butler.registry.Registry
373 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
375 def getCollectionSummary(self, collection: str) -> CollectionSummary:
376 # Docstring inherited from lsst.daf.butler.registry.Registry
377 record = self._managers.collections.find(collection)
378 return self._managers.datasets.getCollectionSummary(record)
380 def registerDatasetType(self, datasetType: DatasetType) -> bool:
381 # Docstring inherited from lsst.daf.butler.registry.Registry
382 _, inserted = self._managers.datasets.register(datasetType)
383 return inserted
385 def removeDatasetType(self, name: str) -> None:
386 # Docstring inherited from lsst.daf.butler.registry.Registry
387 self._managers.datasets.remove(name)
389 def getDatasetType(self, name: str) -> DatasetType:
390 # Docstring inherited from lsst.daf.butler.registry.Registry
391 return self._managers.datasets[name].datasetType
393 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
394 collections: Any = None, timespan: Optional[Timespan] = None,
395 **kwargs: Any) -> Optional[DatasetRef]:
396 # Docstring inherited from lsst.daf.butler.registry.Registry
397 if isinstance(datasetType, DatasetType):
398 storage = self._managers.datasets[datasetType.name]
399 else:
400 storage = self._managers.datasets[datasetType]
401 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
402 universe=self.dimensions, defaults=self.defaults.dataId,
403 **kwargs)
404 if collections is None:
405 if not self.defaults.collections:
406 raise TypeError("No collections provided to findDataset, "
407 "and no defaults from registry construction.")
408 collections = self.defaults.collections
409 else:
410 collections = CollectionSearch.fromExpression(collections)
411 for collectionRecord in collections.iter(self._managers.collections):
412 if (collectionRecord.type is CollectionType.CALIBRATION
413 and (not storage.datasetType.isCalibration() or timespan is None)):
414 continue
415 result = storage.find(collectionRecord, dataId, timespan=timespan)
416 if result is not None:
417 return result
419 return None
421 @transactional
422 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
423 run: Optional[str] = None, expand: bool = True,
424 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
425 # Docstring inherited from lsst.daf.butler.registry.Registry
426 if isinstance(datasetType, DatasetType):
427 storage = self._managers.datasets.find(datasetType.name)
428 if storage is None:
429 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
430 else:
431 storage = self._managers.datasets.find(datasetType)
432 if storage is None:
433 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
434 if run is None:
435 if self.defaults.run is None:
436 raise TypeError("No run provided to insertDatasets, "
437 "and no default from registry construction.")
438 run = self.defaults.run
439 runRecord = self._managers.collections.find(run)
440 if runRecord.type is not CollectionType.RUN:
441 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
442 assert isinstance(runRecord, RunRecord)
443 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
444 if expand:
445 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
446 for dataId in progress.wrap(dataIds,
447 f"Expanding {storage.datasetType.name} data IDs")]
448 else:
449 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
450 for dataId in dataIds]
451 try:
452 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
453 except sqlalchemy.exc.IntegrityError as err:
454 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
455 f"one or more datasets of type {storage.datasetType} into "
456 f"collection '{run}'. "
457 f"This probably means a dataset with the same data ID "
458 f"and dataset type already exists, but it may also mean a "
459 f"dimension row is missing.") from err
460 return refs
462 @transactional
463 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
464 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
465 reuseIds: bool = False) -> List[DatasetRef]:
466 # Docstring inherited from lsst.daf.butler.registry.Registry
467 datasets = list(datasets)
468 if not datasets:
469 # nothing to do
470 return []
472 # find dataset type
473 datasetTypes = set(dataset.datasetType for dataset in datasets)
474 if len(datasetTypes) != 1:
475 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
476 datasetType = datasetTypes.pop()
478 # get storage handler for this dataset type
479 storage = self._managers.datasets.find(datasetType.name)
480 if storage is None:
481 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
483 # find run name
484 runs = set(dataset.run for dataset in datasets)
485 if len(runs) != 1:
486 raise ValueError(f"Multiple run names in input datasets: {runs}")
487 run = runs.pop()
488 if run is None:
489 if self.defaults.run is None:
490 raise TypeError("No run provided to ingestDatasets, "
491 "and no default from registry construction.")
492 run = self.defaults.run
494 runRecord = self._managers.collections.find(run)
495 if runRecord.type is not CollectionType.RUN:
496 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
497 " RUN collection required.")
498 assert isinstance(runRecord, RunRecord)
500 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
501 if expand:
502 expandedDatasets = [
503 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
504 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
505 else:
506 expandedDatasets = [
507 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
508 for dataset in datasets
509 ]
511 try:
512 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
513 except sqlalchemy.exc.IntegrityError as err:
514 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
515 f"one or more datasets of type {storage.datasetType} into "
516 f"collection '{run}'. "
517 f"This probably means a dataset with the same data ID "
518 f"and dataset type already exists, but it may also mean a "
519 f"dimension row is missing.") from err
520 return refs
522 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
523 # Docstring inherited from lsst.daf.butler.registry.Registry
524 return self._managers.datasets.getDatasetRef(id)
526 @transactional
527 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
528 # Docstring inherited from lsst.daf.butler.registry.Registry
529 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
530 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
531 desc="Removing datasets by type"):
532 storage = self._managers.datasets[datasetType.name]
533 try:
534 storage.delete(refsForType)
535 except sqlalchemy.exc.IntegrityError as err:
536 raise OrphanedRecordError("One or more datasets is still "
537 "present in one or more Datastores.") from err
539 @transactional
540 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
541 # Docstring inherited from lsst.daf.butler.registry.Registry
542 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
543 collectionRecord = self._managers.collections.find(collection)
544 if collectionRecord.type is not CollectionType.TAGGED:
545 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
546 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
547 desc="Associating datasets by type"):
548 storage = self._managers.datasets[datasetType.name]
549 try:
550 storage.associate(collectionRecord, refsForType)
551 except sqlalchemy.exc.IntegrityError as err:
552 raise ConflictingDefinitionError(
553 f"Constraint violation while associating dataset of type {datasetType.name} with "
554 f"collection {collection}. This probably means that one or more datasets with the same "
555 f"dataset type and data ID already exist in the collection, but it may also indicate "
556 f"that the datasets do not exist."
557 ) from err
559 @transactional
560 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
561 # Docstring inherited from lsst.daf.butler.registry.Registry
562 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
563 collectionRecord = self._managers.collections.find(collection)
564 if collectionRecord.type is not CollectionType.TAGGED:
565 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
566 "expected TAGGED.")
567 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
568 desc="Disassociating datasets by type"):
569 storage = self._managers.datasets[datasetType.name]
570 storage.disassociate(collectionRecord, refsForType)
572 @transactional
573 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
574 # Docstring inherited from lsst.daf.butler.registry.Registry
575 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
576 collectionRecord = self._managers.collections.find(collection)
577 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
578 desc="Certifying datasets by type"):
579 storage = self._managers.datasets[datasetType.name]
580 storage.certify(collectionRecord, refsForType, timespan)
582 @transactional
583 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
584 dataIds: Optional[Iterable[DataId]] = None) -> None:
585 # Docstring inherited from lsst.daf.butler.registry.Registry
586 collectionRecord = self._managers.collections.find(collection)
587 if isinstance(datasetType, str):
588 storage = self._managers.datasets[datasetType]
589 else:
590 storage = self._managers.datasets[datasetType.name]
591 standardizedDataIds = None
592 if dataIds is not None:
593 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
594 for d in dataIds]
595 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
597 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
598 """Return an object that allows a new `Datastore` instance to
599 communicate with this `Registry`.
601 Returns
602 -------
603 manager : `DatastoreRegistryBridgeManager`
604 Object that mediates communication between this `Registry` and its
605 associated datastores.
606 """
607 return self._managers.datastores
609 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
610 # Docstring inherited from lsst.daf.butler.registry.Registry
611 return self._managers.datastores.findDatastores(ref)
613 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
614 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
615 withDefaults: bool = True,
616 **kwargs: Any) -> DataCoordinate:
617 # Docstring inherited from lsst.daf.butler.registry.Registry
618 if not withDefaults:
619 defaults = None
620 else:
621 defaults = self.defaults.dataId
622 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
623 defaults=defaults, **kwargs)
624 if standardized.hasRecords():
625 return standardized
626 if records is None:
627 records = {}
628 elif isinstance(records, NamedKeyMapping):
629 records = records.byName()
630 else:
631 records = dict(records)
632 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
633 records.update(dataId.records.byName())
634 keys = standardized.byName()
635 for element in standardized.graph.primaryKeyTraversalOrder:
636 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
637 if record is ...:
638 if isinstance(element, Dimension) and keys.get(element.name) is None:
639 if element in standardized.graph.required:
640 raise LookupError(
641 f"No value or null value for required dimension {element.name}."
642 )
643 keys[element.name] = None
644 record = None
645 else:
646 storage = self._managers.dimensions[element]
647 dataIdSet = DataCoordinateIterable.fromScalar(
648 DataCoordinate.standardize(keys, graph=element.graph)
649 )
650 fetched = tuple(storage.fetch(dataIdSet))
651 try:
652 (record,) = fetched
653 except ValueError:
654 record = None
655 records[element.name] = record
656 if record is not None:
657 for d in element.implied:
658 value = getattr(record, d.name)
659 if keys.setdefault(d.name, value) != value:
660 raise InconsistentDataIdError(
661 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
662 f"but {element.name} implies {d.name}={value!r}."
663 )
664 else:
665 if element in standardized.graph.required:
666 raise LookupError(
667 f"Could not fetch record for required dimension {element.name} via keys {keys}."
668 )
669 if element.alwaysJoin:
670 raise InconsistentDataIdError(
671 f"Could not fetch record for element {element.name} via keys {keys}, ",
672 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
673 "related."
674 )
675 for d in element.implied:
676 keys.setdefault(d.name, None)
677 records.setdefault(d.name, None)
678 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
680 def insertDimensionData(self, element: Union[DimensionElement, str],
681 *data: Union[Mapping[str, Any], DimensionRecord],
682 conform: bool = True) -> None:
683 # Docstring inherited from lsst.daf.butler.registry.Registry
684 if conform:
685 if isinstance(element, str):
686 element = self.dimensions[element]
687 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
688 for row in data]
689 else:
690 # Ignore typing since caller said to trust them with conform=False.
691 records = data # type: ignore
692 storage = self._managers.dimensions[element] # type: ignore
693 storage.insert(*records)
695 def syncDimensionData(self, element: Union[DimensionElement, str],
696 row: Union[Mapping[str, Any], DimensionRecord],
697 conform: bool = True) -> bool:
698 # Docstring inherited from lsst.daf.butler.registry.Registry
699 if conform:
700 if isinstance(element, str):
701 element = self.dimensions[element]
702 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
703 else:
704 # Ignore typing since caller said to trust them with conform=False.
705 record = row # type: ignore
706 storage = self._managers.dimensions[element] # type: ignore
707 return storage.sync(record)
709 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
710 ) -> Iterator[DatasetType]:
711 # Docstring inherited from lsst.daf.butler.registry.Registry
712 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
713 if wildcard is Ellipsis:
714 for datasetType in self._managers.datasets:
715 # The dataset type can no longer be a component
716 yield datasetType
717 if components:
718 # Automatically create the component dataset types
719 try:
720 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
721 except KeyError as err:
722 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
723 "if it has components they will not be included in query results.")
724 else:
725 yield from componentsForDatasetType
726 return
727 done: Set[str] = set()
728 for name in wildcard.strings:
729 storage = self._managers.datasets.find(name)
730 if storage is not None:
731 done.add(storage.datasetType.name)
732 yield storage.datasetType
733 if wildcard.patterns:
734 # If components (the argument) is None, we'll save component
735 # dataset types that we might want to match, but only if their
736 # parents didn't get included.
737 componentsForLater = []
738 for registeredDatasetType in self._managers.datasets:
739 # Components are not stored in registry so expand them here
740 allDatasetTypes = [registeredDatasetType]
741 try:
742 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
743 except KeyError as err:
744 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
745 "if it has components they will not be included in query results.")
746 for datasetType in allDatasetTypes:
747 if datasetType.name in done:
748 continue
749 parentName, componentName = datasetType.nameAndComponent()
750 if componentName is not None and not components:
751 if components is None and parentName not in done:
752 componentsForLater.append(datasetType)
753 continue
754 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
755 done.add(datasetType.name)
756 yield datasetType
757 # Go back and try to match saved components.
758 for datasetType in componentsForLater:
759 parentName, _ = datasetType.nameAndComponent()
760 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
761 yield datasetType
763 def queryCollections(self, expression: Any = ...,
764 datasetType: Optional[DatasetType] = None,
765 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
766 flattenChains: bool = False,
767 includeChains: Optional[bool] = None) -> Iterator[str]:
768 # Docstring inherited from lsst.daf.butler.registry.Registry
770 # Right now the datasetType argument is completely ignored, but that
771 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
772 # ticket will take care of that.
773 query = CollectionQuery.fromExpression(expression)
774 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
775 flattenChains=flattenChains, includeChains=includeChains):
776 yield record.name
778 def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
779 """Return a `QueryBuilder` instance capable of constructing and
780 managing more complex queries than those obtainable via `Registry`
781 interfaces.
783 This is an advanced interface; downstream code should prefer
784 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
785 are sufficient.
787 Parameters
788 ----------
789 summary : `queries.QuerySummary`
790 Object describing and categorizing the full set of dimensions that
791 will be included in the query.
793 Returns
794 -------
795 builder : `queries.QueryBuilder`
796 Object that can be used to construct and perform advanced queries.
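Examples
--------
A minimal sketch of building and finishing a custom query; the dimension
names are hypothetical and the `~queries.QuerySummary` arguments mirror
the way this class uses them internally::

    from lsst.daf.butler import DimensionGraph
    from lsst.daf.butler.registry import queries

    summary = queries.QuerySummary(
        requested=DimensionGraph(registry.dimensions, names={"instrument", "visit"}),
    )
    builder = registry._makeQueryBuilder(summary)
    query = builder.finish()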
797 """
798 return queries.QueryBuilder(
799 summary,
800 queries.RegistryManagers(
801 collections=self._managers.collections,
802 dimensions=self._managers.dimensions,
803 datasets=self._managers.datasets,
804 TimespanReprClass=self._db.getTimespanRepresentation(),
805 ),
806 )
808 def queryDatasets(self, datasetType: Any, *,
809 collections: Any = None,
810 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
811 dataId: Optional[DataId] = None,
812 where: Optional[str] = None,
813 findFirst: bool = False,
814 components: Optional[bool] = None,
815 bind: Optional[Mapping[str, Any]] = None,
816 check: bool = True,
817 **kwargs: Any) -> queries.DatasetQueryResults:
818 # Docstring inherited from lsst.daf.butler.registry.Registry
820 # Standardize the collections expression.
821 if collections is None:
822 if not self.defaults.collections:
823 raise TypeError("No collections provided to findDataset, "
824 "and no defaults from registry construction.")
825 collections = self.defaults.collections
826 elif findFirst:
827 collections = CollectionSearch.fromExpression(collections)
828 else:
829 collections = CollectionQuery.fromExpression(collections)
830 # Standardize and expand the data ID provided as a constraint.
831 standardizedDataId = self.expandDataId(dataId, **kwargs)
833 # We can only query directly if given a non-component DatasetType
834 # instance. If we were given an expression or str or a component
835 # DatasetType instance, we'll populate this dict, recurse, and return.
836 # If we already have a non-component DatasetType, it will remain None
837 # and we'll run the query directly.
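# For illustration (hypothetical dataset type names): a query for
# "calexp.psf" would produce composition == {calexp: ["psf"]}, while an
# expression matching both "calexp" and "calexp.wcs" would produce
# composition == {calexp: [None, "wcs"]}.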
838 composition: Optional[
839 Dict[
840 DatasetType, # parent dataset type
841 List[Optional[str]] # component name, or None for parent
842 ]
843 ] = None
844 if not isinstance(datasetType, DatasetType):
845 # We were given a dataset type expression (which may be as simple
846 # as a str). Loop over all matching datasets, delegating handling
847 # of the `components` argument to queryDatasetTypes, as we populate
848 # the composition dict.
849 composition = defaultdict(list)
850 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
851 parentName, componentName = trueDatasetType.nameAndComponent()
852 if componentName is not None:
853 parentDatasetType = self.getDatasetType(parentName)
854 composition.setdefault(parentDatasetType, []).append(componentName)
855 else:
856 composition.setdefault(trueDatasetType, []).append(None)
857 elif datasetType.isComponent():
858 # We were given a true DatasetType instance, but it's a component.
859 # the composition dict will have exactly one item.
860 parentName, componentName = datasetType.nameAndComponent()
861 parentDatasetType = self.getDatasetType(parentName)
862 composition = {parentDatasetType: [componentName]}
863 if composition is not None:
864 # We need to recurse. Do that once for each parent dataset type.
865 chain = []
866 for parentDatasetType, componentNames in composition.items():
867 parentResults = self.queryDatasets(
868 parentDatasetType,
869 collections=collections,
870 dimensions=dimensions,
871 dataId=standardizedDataId,
872 where=where,
873 findFirst=findFirst,
874 check=check,
875 )
876 if isinstance(parentResults, queries.ParentDatasetQueryResults):
877 chain.append(
878 parentResults.withComponents(componentNames)
879 )
880 else:
881 # Should only happen if we know there would be no results.
882 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
883 and not parentResults._chain
884 return queries.ChainedDatasetQueryResults(chain)
885 # If we get here, there's no need to recurse (or we are already
886 # recursing; there can only ever be one level of recursion).
888 # The full set of dimensions in the query is the combination of those
889 # needed for the DatasetType and those explicitly requested, if any.
890 requestedDimensionNames = set(datasetType.dimensions.names)
891 if dimensions is not None:
892 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
893 # Construct the summary structure needed to construct a QueryBuilder.
894 summary = queries.QuerySummary(
895 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
896 dataId=standardizedDataId,
897 expression=where,
898 bind=bind,
899 defaults=self.defaults.dataId,
900 check=check,
901 )
902 builder = self._makeQueryBuilder(summary)
903 # Add the dataset subquery to the query, telling the QueryBuilder to
904 # include the rank of the selected collection in the results only if we
905 # need to findFirst. Note that if any of the collections are
906 # actually wildcard expressions, and we've asked for deduplication,
907 # this will raise TypeError for us.
908 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
909 return queries.ChainedDatasetQueryResults(())
910 query = builder.finish()
911 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
913 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
914 dataId: Optional[DataId] = None,
915 datasets: Any = None,
916 collections: Any = None,
917 where: Optional[str] = None,
918 components: Optional[bool] = None,
919 bind: Optional[Mapping[str, Any]] = None,
920 check: bool = True,
921 **kwargs: Any) -> queries.DataCoordinateQueryResults:
922 # Docstring inherited from lsst.daf.butler.registry.Registry
923 dimensions = iterable(dimensions)
924 standardizedDataId = self.expandDataId(dataId, **kwargs)
925 standardizedDatasetTypes = set()
926 requestedDimensions = self.dimensions.extract(dimensions)
927 queryDimensionNames = set(requestedDimensions.names)
928 if datasets is not None:
929 if collections is None:
930 if not self.defaults.collections:
931 raise TypeError("Cannot pass 'datasets' without 'collections'.")
932 collections = self.defaults.collections
933 else:
934 # Preprocess collections expression in case the original
935 # included single-pass iterators (we'll want to use it multiple
936 # times below).
937 collections = CollectionQuery.fromExpression(collections)
938 for datasetType in self.queryDatasetTypes(datasets, components=components):
939 queryDimensionNames.update(datasetType.dimensions.names)
940 # If any matched dataset type is a component, just operate on
941 # its parent instead, because Registry doesn't know anything
942 # about what components exist, and here (unlike queryDatasets)
943 # we don't care about returning them.
944 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
945 if componentName is not None:
946 datasetType = self.getDatasetType(parentDatasetTypeName)
947 standardizedDatasetTypes.add(datasetType)
949 summary = queries.QuerySummary(
950 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
951 dataId=standardizedDataId,
952 expression=where,
953 bind=bind,
954 defaults=self.defaults.dataId,
955 check=check,
956 )
957 builder = self._makeQueryBuilder(summary)
958 for datasetType in standardizedDatasetTypes:
959 builder.joinDataset(datasetType, collections, isResult=False)
960 query = builder.finish()
961 return queries.DataCoordinateQueryResults(self._db, query)
963 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
964 dataId: Optional[DataId] = None,
965 datasets: Any = None,
966 collections: Any = None,
967 where: Optional[str] = None,
968 components: Optional[bool] = None,
969 bind: Optional[Mapping[str, Any]] = None,
970 check: bool = True,
971 **kwargs: Any) -> Iterator[DimensionRecord]:
972 # Docstring inherited from lsst.daf.butler.registry.Registry
973 if not isinstance(element, DimensionElement):
974 try:
975 element = self.dimensions[element]
976 except KeyError as e:
977 raise KeyError(f"No such dimension '{element}', available dimensions: "
978 + str(self.dimensions.getStaticElements())) from e
979 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
980 where=where, components=components, bind=bind, check=check, **kwargs)
981 return iter(self._managers.dimensions[element].fetch(dataIds))
983 def queryDatasetAssociations(
984 self,
985 datasetType: Union[str, DatasetType],
986 collections: Any = ...,
987 *,
988 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
989 flattenChains: bool = False,
990 ) -> Iterator[DatasetAssociation]:
991 # Docstring inherited from lsst.daf.butler.registry.Registry
992 if collections is None:
993 if not self.defaults.collections:
994 raise TypeError("No collections provided to findDataset, "
995 "and no defaults from registry construction.")
996 collections = self.defaults.collections
997 else:
998 collections = CollectionQuery.fromExpression(collections)
999 TimespanReprClass = self._db.getTimespanRepresentation()
1000 if isinstance(datasetType, str):
1001 storage = self._managers.datasets[datasetType]
1002 else:
1003 storage = self._managers.datasets[datasetType.name]
1004 for collectionRecord in collections.iter(self._managers.collections,
1005 collectionTypes=frozenset(collectionTypes),
1006 flattenChains=flattenChains):
1007 query = storage.select(collectionRecord)
1008 if query is None:
1009 continue
1010 for row in self._db.query(query.combine()):
1011 dataId = DataCoordinate.fromRequiredValues(
1012 storage.datasetType.dimensions,
1013 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1014 )
1015 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1016 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1017 conform=False)
1018 if collectionRecord.type is CollectionType.CALIBRATION:
1019 timespan = TimespanReprClass.extract(row)
1020 else:
1021 timespan = None
1022 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1024 storageClasses: StorageClassFactory
1025 """All storage classes known to the registry (`StorageClassFactory`).
1026 """