Coverage for python/lsst/daf/butler/registry/_sqlRegistry.py : 13%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "SqlRegistry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 TYPE_CHECKING,
41 Union,
42)
44import sqlalchemy
46from ..core import (
47 ButlerURI,
48 Config,
49 DataCoordinate,
50 DataCoordinateIterable,
51 DataId,
52 DatasetAssociation,
53 DatasetId,
54 DatasetRef,
55 DatasetType,
56 ddl,
57 Dimension,
58 DimensionConfig,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 NamedKeyMapping,
64 NameLookupMapping,
65 Progress,
66 StorageClassFactory,
67 Timespan,
68)
69from . import queries
70from ..core.utils import iterable, transactional
71from ._config import RegistryConfig
72from ._collectionType import CollectionType
73from ._defaults import RegistryDefaults
74from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
75from .managers import RegistryManagerTypes, RegistryManagerInstances
76from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
77from .summaries import CollectionSummary
78from .interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
79from ._registry import Registry
81 if TYPE_CHECKING:  # coverage: 81 ↛ 82, line 81 didn't jump to line 82 because the condition on line 81 was never true
82 from .._butlerConfig import ButlerConfig
83 from .interfaces import (
84 CollectionRecord,
85 Database,
86 DatastoreRegistryBridgeManager,
87 )
90_LOG = logging.getLogger(__name__)
93class SqlRegistry(Registry):
94 """Registry implementation based on SQLAlchemy.
96 Parameters
97 ----------
98 database : `Database`
99 Database instance to store Registry.
100 defaults : `RegistryDefaults`
101 Default collection search path and/or output `~CollectionType.RUN`
102 collection.
103 managers : `RegistryManagerInstances`
104 All the managers required for this registry.
105 """
107 defaultConfigFile: Optional[str] = None
108 """Path to configuration defaults. Accessed within the ``configs`` resource
109 or relative to a search path. Can be `None` if no defaults are specified.
110 """
112 @classmethod
113 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
114 dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
115 butlerRoot: Optional[str] = None) -> Registry:
116 """Create registry database and return `SqlRegistry` instance.
118 This method initializes database contents; the database must be empty
119 prior to calling this method.
121 Parameters
122 ----------
123 config : `RegistryConfig` or `str`, optional
124 Registry configuration. If missing, the default configuration is
125 loaded from ``registry.yaml``.
126 dimensionConfig : `DimensionConfig` or `str`, optional
127 Dimension configuration. If missing, the default configuration is
128 loaded from ``dimensions.yaml``.
129 butlerRoot : `str`, optional
130 Path to the repository root this `SqlRegistry` will manage.
132 Returns
133 -------
134 registry : `SqlRegistry`
135 A new `SqlRegistry` instance.
136 """
137 config = cls.forceRegistryConfig(config)
138 config.replaceRoot(butlerRoot)
140 if isinstance(dimensionConfig, str):
141 dimensionConfig = DimensionConfig(dimensionConfig)
142 elif dimensionConfig is None:
143 dimensionConfig = DimensionConfig()
144 elif not isinstance(dimensionConfig, DimensionConfig):
145 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
147 DatabaseClass = config.getDatabaseClass()
148 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
149 namespace=config.get("namespace"))
150 managerTypes = RegistryManagerTypes.fromConfig(config)
151 managers = managerTypes.makeRepo(database, dimensionConfig)
152 return cls(database, RegistryDefaults(), managers)
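# Usage sketch (added; not part of the original source). The configuration file
# names and repository root below are hypothetical placeholders:
#
#     config = RegistryConfig("registry.yaml")
#     registry = SqlRegistry.createFromConfig(
#         config, dimensionConfig="dimensions.yaml", butlerRoot="/path/to/repo")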
154 @classmethod
155 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
156 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
157 defaults: Optional[RegistryDefaults] = None) -> Registry:
158 """Create `Registry` subclass instance from `config`.
160 Registry database must be inbitialized prior to calling this method.
162 Parameters
163 ----------
164 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
165 Registry configuration
166 butlerRoot : `str` or `ButlerURI`, optional
167 Path to the repository root this `Registry` will manage.
168 writeable : `bool`, optional
169 If `True` (default) create a read-write connection to the database.
170 defaults : `RegistryDefaults`, optional
171 Default collection search path and/or output `~CollectionType.RUN`
172 collection.
174 Returns
175 -------
176 registry : `SqlRegistry` (subclass)
177 A new `SqlRegistry` subclass instance.
178 """
179 config = cls.forceRegistryConfig(config)
180 config.replaceRoot(butlerRoot)
181 DatabaseClass = config.getDatabaseClass()
182 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
183 namespace=config.get("namespace"), writeable=writeable)
184 managerTypes = RegistryManagerTypes.fromConfig(config)
185 managers = managerTypes.loadRepo(database)
186 if defaults is None:
187 defaults = RegistryDefaults()
188 return cls(database, defaults, managers)
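# Usage sketch (added; not part of the original source). Opening an existing
# repository read-only; "butler.yaml" is a hypothetical configuration path:
#
#     registry = SqlRegistry.fromConfig("butler.yaml", writeable=False)
#     print(registry.isWriteable())   # False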
190 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
191 self._db = database
192 self._managers = managers
193 self.storageClasses = StorageClassFactory()
194 # Intentionally invoke property setter to initialize defaults. This
195 # can only be done after most of the rest of Registry has already been
196 # initialized, and must be done before the property getter is used.
197 self.defaults = defaults
199 def __str__(self) -> str:
200 return str(self._db)
202 def __repr__(self) -> str:
203 return f"SqlRegistry({self._db!r}, {self.dimensions!r})"
205 def isWriteable(self) -> bool:
206 # Docstring inherited from lsst.daf.butler.registry.Registry
207 return self._db.isWriteable()
209 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
210 # Docstring inherited from lsst.daf.butler.registry.Registry
211 if defaults is None:
212 # No need to copy, because `RegistryDefaults` is immutable; we
213 # effectively copy on write.
214 defaults = self.defaults
215 return type(self)(self._db, defaults, self._managers)
217 @property
218 def dimensions(self) -> DimensionUniverse:
219 # Docstring inherited from lsst.daf.butler.registry.Registry
220 return self._managers.dimensions.universe
222 def refresh(self) -> None:
223 # Docstring inherited from lsst.daf.butler.registry.Registry
224 self._managers.refresh()
226 @contextlib.contextmanager
227 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
228 # Docstring inherited from lsst.daf.butler.registry.Registry
229 try:
230 with self._db.transaction(savepoint=savepoint):
231 yield
232 except BaseException:
233 # TODO: this clears the caches sometimes when we wouldn't actually
234 # need to. Can we avoid that?
235 self._managers.dimensions.clearCaches()
236 raise
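# Usage sketch (added; not part of the original source). Grouping writes so
# they are committed or rolled back together; the collection name is made up:
#
#     with registry.transaction(savepoint=True):
#         registry.registerRun("u/example/run")
#         registry.setCollectionDocumentation("u/example/run", "example run")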
238 def resetConnectionPool(self) -> None:
239 """Reset SQLAlchemy connection pool for `SqlRegistry` database.
241 This operation is useful when using registry with fork-based
242 multiprocessing. To use registry across fork boundary one has to make
243 sure that there are no currently active connections (no session or
244 transaction is in progress) and connection pool is reset using this
245 method. This method should be called by the child process immediately
246 after the fork.
247 """
248 self._db._engine.dispose()
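# Usage sketch (added; not part of the original source). With fork-based
# multiprocessing, each child should reset the pool before touching the
# registry; the worker function here is hypothetical:
#
#     import multiprocessing
#
#     def worker(registry):
#         registry.resetConnectionPool()   # first thing after the fork
#         # ... use the registry normally ...
#
#     ctx = multiprocessing.get_context("fork")
#     ctx.Process(target=worker, args=(registry,)).start()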
250 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
251 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
252 other data repository client.
254 Opaque table records can be added via `insertOpaqueData`, retrieved via
255 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
257 Parameters
258 ----------
259 tableName : `str`
260 Logical name of the opaque table. This may differ from the
261 actual name used in the database by a prefix and/or suffix.
262 spec : `ddl.TableSpec`
263 Specification for the table to be added.
264 """
265 self._managers.opaque.register(tableName, spec)
267 @transactional
268 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
269 """Insert records into an opaque table.
271 Parameters
272 ----------
273 tableName : `str`
274 Logical name of the opaque table. Must match the name used in a
275 previous call to `registerOpaqueTable`.
276 data
277 Each additional positional argument is a dictionary that represents
278 a single row to be added.
279 """
280 self._managers.opaque[tableName].insert(*data)
282 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
283 """Retrieve records from an opaque table.
285 Parameters
286 ----------
287 tableName : `str`
288 Logical name of the opaque table. Must match the name used in a
289 previous call to `registerOpaqueTable`.
290 where
291 Additional keyword arguments are interpreted as equality
292 constraints that restrict the returned rows (combined with AND);
293 keyword arguments are column names and values are the values they
294 must have.
296 Yields
297 ------
298 row : `dict`
299 A dictionary representing a single result row.
300 """
301 yield from self._managers.opaque[tableName].fetch(**where)
303 @transactional
304 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
305 """Remove records from an opaque table.
307 Parameters
308 ----------
309 tableName : `str`
310 Logical name of the opaque table. Must match the name used in a
311 previous call to `registerOpaqueTable`.
312 where
313 Additional keyword arguments are interpreted as equality
314 constraints that restrict the deleted rows (combined with AND);
315 keyword arguments are column names and values are the values they
316 must have.
317 """
318 self._managers.opaque[tableName].delete(where.keys(), where)
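# Usage sketch (added; not part of the original source). A round trip through
# the opaque-table API; the table name, column names, and spec are hypothetical:
#
#     spec = ddl.TableSpec(fields=[...])   # column definitions elided
#     registry.registerOpaqueTable("example_datastore_records", spec)
#     registry.insertOpaqueData("example_datastore_records",
#                               {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("example_datastore_records", dataset_id=1)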
320 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
321 doc: Optional[str] = None) -> None:
322 # Docstring inherited from lsst.daf.butler.registry.Registry
323 self._managers.collections.register(name, type, doc=doc)
325 def getCollectionType(self, name: str) -> CollectionType:
326 # Docstring inherited from lsst.daf.butler.registry.Registry
327 return self._managers.collections.find(name).type
329 def _get_collection_record(self, name: str) -> CollectionRecord:
330 # Docstring inherited from lsst.daf.butler.registry.Registry
331 return self._managers.collections.find(name)
333 def registerRun(self, name: str, doc: Optional[str] = None) -> None:
334 # Docstring inherited from lsst.daf.butler.registry.Registry
335 self._managers.collections.register(name, CollectionType.RUN, doc=doc)
337 @transactional
338 def removeCollection(self, name: str) -> None:
339 # Docstring inherited from lsst.daf.butler.registry.Registry
340 self._managers.collections.remove(name)
342 def getCollectionChain(self, parent: str) -> CollectionSearch:
343 # Docstring inherited from lsst.daf.butler.registry.Registry
344 record = self._managers.collections.find(parent)
345 if record.type is not CollectionType.CHAINED:
346 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
347 assert isinstance(record, ChainedCollectionRecord)
348 return record.children
350 @transactional
351 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
352 # Docstring inherited from lsst.daf.butler.registry.Registry
353 record = self._managers.collections.find(parent)
354 if record.type is not CollectionType.CHAINED:
355 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
356 assert isinstance(record, ChainedCollectionRecord)
357 children = CollectionSearch.fromExpression(children)
358 if children != record.children or flatten:
359 record.update(self._managers.collections, children, flatten=flatten)
361 def getCollectionDocumentation(self, collection: str) -> Optional[str]:
362 # Docstring inherited from lsst.daf.butler.registry.Registry
363 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)
365 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
366 # Docstring inherited from lsst.daf.butler.registry.Registry
367 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)
369 def getCollectionSummary(self, collection: str) -> CollectionSummary:
370 # Docstring inherited from lsst.daf.butler.registry.Registry
371 record = self._managers.collections.find(collection)
372 return self._managers.datasets.getCollectionSummary(record)
374 def registerDatasetType(self, datasetType: DatasetType) -> bool:
375 # Docstring inherited from lsst.daf.butler.registry.Registry
376 _, inserted = self._managers.datasets.register(datasetType)
377 return inserted
379 def removeDatasetType(self, name: str) -> None:
380 # Docstring inherited from lsst.daf.butler.registry.Registry
381 self._managers.datasets.remove(name)
383 def getDatasetType(self, name: str) -> DatasetType:
384 # Docstring inherited from lsst.daf.butler.registry.Registry
385 return self._managers.datasets[name].datasetType
387 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
388 collections: Any = None, timespan: Optional[Timespan] = None,
389 **kwargs: Any) -> Optional[DatasetRef]:
390 # Docstring inherited from lsst.daf.butler.registry.Registry
391 if isinstance(datasetType, DatasetType):
392 storage = self._managers.datasets[datasetType.name]
393 else:
394 storage = self._managers.datasets[datasetType]
395 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
396 universe=self.dimensions, defaults=self.defaults.dataId,
397 **kwargs)
398 if collections is None:
399 if not self.defaults.collections:
400 raise TypeError("No collections provided to findDataset, "
401 "and no defaults from registry construction.")
402 collections = self.defaults.collections
403 else:
404 collections = CollectionSearch.fromExpression(collections)
405 for collectionRecord in collections.iter(self._managers.collections):
406 if (collectionRecord.type is CollectionType.CALIBRATION
407 and (not storage.datasetType.isCalibration() or timespan is None)):
408 continue
409 result = storage.find(collectionRecord, dataId, timespan=timespan)
410 if result is not None:
411 return result
413 return None
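# Usage sketch (added; not part of the original source). The dataset type,
# dimension values, and collection name are hypothetical; a ``timespan``
# argument is needed to match datasets in CALIBRATION collections:
#
#     ref = registry.findDataset("flat", instrument="HSC", detector=0,
#                                collections=["HSC/calib"])
#     if ref is not None:
#         print(ref.dataId, ref.run)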
415 @transactional
416 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
417 run: Optional[str] = None, expand: bool = True,
418 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
419 # Docstring inherited from lsst.daf.butler.registry.Registry
420 if isinstance(datasetType, DatasetType):
421 storage = self._managers.datasets.find(datasetType.name)
422 if storage is None:
423 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
424 else:
425 storage = self._managers.datasets.find(datasetType)
426 if storage is None:
427 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
428 if run is None:
429 if self.defaults.run is None:
430 raise TypeError("No run provided to insertDatasets, "
431 "and no default from registry construction.")
432 run = self.defaults.run
433 runRecord = self._managers.collections.find(run)
434 if runRecord.type is not CollectionType.RUN:
435 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
436 assert isinstance(runRecord, RunRecord)
437 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
438 if expand:
439 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
440 for dataId in progress.wrap(dataIds,
441 f"Expanding {storage.datasetType.name} data IDs")]
442 else:
443 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
444 for dataId in dataIds]
445 try:
446 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
447 except sqlalchemy.exc.IntegrityError as err:
448 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
449 f"one or more datasets of type {storage.datasetType} into "
450 f"collection '{run}'. "
451 f"This probably means a dataset with the same data ID "
452 f"and dataset type already exists, but it may also mean a "
453 f"dimension row is missing.") from err
454 return refs
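# Usage sketch (added; not part of the original source). The dataset type,
# run name, and data ID keys are hypothetical and must match an already
# registered DatasetType and its dimensions:
#
#     registry.registerRun("u/example/run")
#     refs = registry.insertDatasets(
#         "flat", dataIds=[{"instrument": "HSC", "detector": 0}],
#         run="u/example/run")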
456 @transactional
457 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
458 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
459 reuseIds: bool = False) -> List[DatasetRef]:
460 # Docstring inherited from lsst.daf.butler.registry.Registry
461 datasets = list(datasets)
462 if not datasets:
463 # nothing to do
464 return []
466 # find dataset type
467 datasetTypes = set(dataset.datasetType for dataset in datasets)
468 if len(datasetTypes) != 1:
469 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
470 datasetType = datasetTypes.pop()
472 # get storage handler for this dataset type
473 storage = self._managers.datasets.find(datasetType.name)
474 if storage is None:
475 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
477 # find run name
478 runs = set(dataset.run for dataset in datasets)
479 if len(runs) != 1:
480 raise ValueError(f"Multiple run names in input datasets: {runs}")
481 run = runs.pop()
482 if run is None:
483 if self.defaults.run is None:
484 raise TypeError("No run provided to _importDatasets, "
485 "and no default from registry construction.")
486 run = self.defaults.run
488 runRecord = self._managers.collections.find(run)
489 if runRecord.type is not CollectionType.RUN:
490 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
491 assert isinstance(runRecord, RunRecord)
493 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
494 if expand:
495 expandedDatasets = [
496 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
497 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
498 else:
499 expandedDatasets = [
500 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
501 for dataset in datasets
502 ]
504 try:
505 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
506 except sqlalchemy.exc.IntegrityError as err:
507 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
508 f"one or more datasets of type {storage.datasetType} into "
509 f"collection '{run}'. "
510 f"This probably means a dataset with the same data ID "
511 f"and dataset type already exists, but it may also mean a "
512 f"dimension row is missing.") from err
513 return refs
515 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
516 # Docstring inherited from lsst.daf.butler.registry.Registry
517 return self._managers.datasets.getDatasetRef(id)
519 @transactional
520 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
521 # Docstring inherited from lsst.daf.butler.registry.Registry
522 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
523 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
524 desc="Removing datasets by type"):
525 storage = self._managers.datasets[datasetType.name]
526 try:
527 storage.delete(refsForType)
528 except sqlalchemy.exc.IntegrityError as err:
529 raise OrphanedRecordError("One or more datasets is still "
530 "present in one or more Datastores.") from err
532 @transactional
533 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
534 # Docstring inherited from lsst.daf.butler.registry.Registry
535 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
536 collectionRecord = self._managers.collections.find(collection)
537 if collectionRecord.type is not CollectionType.TAGGED:
538 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
539 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
540 desc="Associating datasets by type"):
541 storage = self._managers.datasets[datasetType.name]
542 try:
543 storage.associate(collectionRecord, refsForType)
544 except sqlalchemy.exc.IntegrityError as err:
545 raise ConflictingDefinitionError(
546 f"Constraint violation while associating dataset of type {datasetType.name} with "
547 f"collection {collection}. This probably means that one or more datasets with the same "
548 f"dataset type and data ID already exist in the collection, but it may also indicate "
549 f"that the datasets do not exist."
550 ) from err
552 @transactional
553 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
554 # Docstring inherited from lsst.daf.butler.registry.Registry
555 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
556 collectionRecord = self._managers.collections.find(collection)
557 if collectionRecord.type is not CollectionType.TAGGED:
558 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
559 "expected TAGGED.")
560 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
561 desc="Disassociating datasets by type"):
562 storage = self._managers.datasets[datasetType.name]
563 storage.disassociate(collectionRecord, refsForType)
565 @transactional
566 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
567 # Docstring inherited from lsst.daf.butler.registry.Registry
568 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
569 collectionRecord = self._managers.collections.find(collection)
570 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
571 desc="Certifying datasets by type"):
572 storage = self._managers.datasets[datasetType.name]
573 storage.certify(collectionRecord, refsForType, timespan)
575 @transactional
576 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
577 dataIds: Optional[Iterable[DataId]] = None) -> None:
578 # Docstring inherited from lsst.daf.butler.registry.Registry
579 collectionRecord = self._managers.collections.find(collection)
580 if isinstance(datasetType, str):
581 storage = self._managers.datasets[datasetType]
582 else:
583 storage = self._managers.datasets[datasetType.name]
584 standardizedDataIds = None
585 if dataIds is not None:
586 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
587 for d in dataIds]
588 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
590 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
591 """Return an object that allows a new `Datastore` instance to
592 communicate with this `Registry`.
594 Returns
595 -------
596 manager : `DatastoreRegistryBridgeManager`
597 Object that mediates communication between this `Registry` and its
598 associated datastores.
599 """
600 return self._managers.datastores
602 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
603 # Docstring inherited from lsst.daf.butler.registry.Registry
604 return self._managers.datastores.findDatastores(ref)
606 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
607 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
608 withDefaults: bool = True,
609 **kwargs: Any) -> DataCoordinate:
610 # Docstring inherited from lsst.daf.butler.registry.Registry
611 if not withDefaults:
612 defaults = None
613 else:
614 defaults = self.defaults.dataId
615 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
616 defaults=defaults, **kwargs)
617 if standardized.hasRecords():
618 return standardized
619 if records is None:
620 records = {}
621 elif isinstance(records, NamedKeyMapping):
622 records = records.byName()
623 else:
624 records = dict(records)
625 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
626 records.update(dataId.records.byName())
627 keys = standardized.byName()
628 for element in standardized.graph.primaryKeyTraversalOrder:
629 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
630 if record is ...:
631 if isinstance(element, Dimension) and keys.get(element.name) is None:
632 if element in standardized.graph.required:
633 raise LookupError(
634 f"No value or null value for required dimension {element.name}."
635 )
636 keys[element.name] = None
637 record = None
638 else:
639 storage = self._managers.dimensions[element]
640 dataIdSet = DataCoordinateIterable.fromScalar(
641 DataCoordinate.standardize(keys, graph=element.graph)
642 )
643 fetched = tuple(storage.fetch(dataIdSet))
644 try:
645 (record,) = fetched
646 except ValueError:
647 record = None
648 records[element.name] = record
649 if record is not None:
650 for d in element.implied:
651 value = getattr(record, d.name)
652 if keys.setdefault(d.name, value) != value:
653 raise InconsistentDataIdError(
654 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
655 f"but {element.name} implies {d.name}={value!r}."
656 )
657 else:
658 if element in standardized.graph.required:
659 raise LookupError(
660 f"Could not fetch record for required dimension {element.name} via keys {keys}."
661 )
662 if element.alwaysJoin:
663 raise InconsistentDataIdError(
664 f"Could not fetch record for element {element.name} via keys {keys}, ",
665 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
666 "related."
667 )
668 for d in element.implied:
669 keys.setdefault(d.name, None)
670 records.setdefault(d.name, None)
671 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
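# Usage sketch (added; not part of the original source). The dimension values
# are hypothetical; the expanded data ID carries the matching dimension records:
#
#     expanded = registry.expandDataId(instrument="HSC", exposure=903334)
#     assert expanded.hasRecords()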
673 def insertDimensionData(self, element: Union[DimensionElement, str],
674 *data: Union[Mapping[str, Any], DimensionRecord],
675 conform: bool = True) -> None:
676 # Docstring inherited from lsst.daf.butler.registry.Registry
677 if conform:
678 if isinstance(element, str):
679 element = self.dimensions[element]
680 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
681 for row in data]
682 else:
683 # Ignore typing since caller said to trust them with conform=False.
684 records = data # type: ignore
685 storage = self._managers.dimensions[element] # type: ignore
686 storage.insert(*records)
688 def syncDimensionData(self, element: Union[DimensionElement, str],
689 row: Union[Mapping[str, Any], DimensionRecord],
690 conform: bool = True) -> bool:
691 # Docstring inherited from lsst.daf.butler.registry.Registry
692 if conform:
693 if isinstance(element, str):
694 element = self.dimensions[element]
695 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
696 else:
697 # Ignore typing since caller said to trust them with conform=False.
698 record = row # type: ignore
699 storage = self._managers.dimensions[element] # type: ignore
700 return storage.sync(record)
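# Usage sketch (added; not part of the original source). The record keys shown
# are illustrative only; the actual required columns come from the dimension
# configuration in use:
#
#     registry.insertDimensionData("instrument", {"name": "HSC"})
#     inserted = registry.syncDimensionData("instrument", {"name": "HSC"})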
702 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
703 ) -> Iterator[DatasetType]:
704 # Docstring inherited from lsst.daf.butler.registry.Registry
705 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
706 if wildcard is Ellipsis:
707 for datasetType in self._managers.datasets:
708 # The dataset type can no longer be a component
709 yield datasetType
710 if components:
711 # Automatically create the component dataset types
712 try:
713 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
714 except KeyError as err:
715 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
716 "if it has components they will not be included in query results.")
717 else:
718 yield from componentsForDatasetType
719 return
720 done: Set[str] = set()
721 for name in wildcard.strings:
722 storage = self._managers.datasets.find(name)
723 if storage is not None:
724 done.add(storage.datasetType.name)
725 yield storage.datasetType
726 if wildcard.patterns:
727 # If components (the argument) is None, we'll save component
728 # dataset types that we might want to match, but only if their
729 # parents didn't get included.
730 componentsForLater = []
731 for registeredDatasetType in self._managers.datasets:
732 # Components are not stored in registry so expand them here
733 allDatasetTypes = [registeredDatasetType]
734 try:
735 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
736 except KeyError as err:
737 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
738 "if it has components they will not be included in query results.")
739 for datasetType in allDatasetTypes:
740 if datasetType.name in done:
741 continue
742 parentName, componentName = datasetType.nameAndComponent()
743 if componentName is not None and not components:
744 if components is None and parentName not in done:
745 componentsForLater.append(datasetType)
746 continue
747 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
748 done.add(datasetType.name)
749 yield datasetType
750 # Go back and try to match saved components.
751 for datasetType in componentsForLater:
752 parentName, _ = datasetType.nameAndComponent()
753 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
754 yield datasetType
756 def queryCollections(self, expression: Any = ...,
757 datasetType: Optional[DatasetType] = None,
758 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
759 flattenChains: bool = False,
760 includeChains: Optional[bool] = None) -> Iterator[str]:
761 # Docstring inherited from lsst.daf.butler.registry.Registry
763 # Right now the datasetTypes argument is completely ignored, but that
764 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
765 # ticket will take care of that.
766 query = CollectionQuery.fromExpression(expression)
767 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
768 flattenChains=flattenChains, includeChains=includeChains):
769 yield record.name
771 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
772 """Return a `QueryBuilder` instance capable of constructing and
773 managing more complex queries than those obtainable via `Registry`
774 interfaces.
776 This is an advanced interface; downstream code should prefer
777 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
778 are sufficient.
780 Parameters
781 ----------
782 summary : `queries.QuerySummary`
783 Object describing and categorizing the full set of dimensions that
784 will be included in the query.
786 Returns
787 -------
788 builder : `queries.QueryBuilder`
789 Object that can be used to construct and perform advanced queries.
790 """
791 return queries.QueryBuilder(
792 summary,
793 queries.RegistryManagers(
794 collections=self._managers.collections,
795 dimensions=self._managers.dimensions,
796 datasets=self._managers.datasets,
797 TimespanReprClass=self._db.getTimespanRepresentation(),
798 ),
799 )
801 def queryDatasets(self, datasetType: Any, *,
802 collections: Any = None,
803 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
804 dataId: Optional[DataId] = None,
805 where: Optional[str] = None,
806 findFirst: bool = False,
807 components: Optional[bool] = None,
808 bind: Optional[Mapping[str, Any]] = None,
809 check: bool = True,
810 **kwargs: Any) -> queries.DatasetQueryResults:
811 # Docstring inherited from lsst.daf.butler.registry.Registry
813 # Standardize the collections expression.
814 if collections is None:
815 if not self.defaults.collections:
816 raise TypeError("No collections provided to queryDatasets, "
817 "and no defaults from registry construction.")
818 collections = self.defaults.collections
819 elif findFirst:
820 collections = CollectionSearch.fromExpression(collections)
821 else:
822 collections = CollectionQuery.fromExpression(collections)
823 # Standardize and expand the data ID provided as a constraint.
824 standardizedDataId = self.expandDataId(dataId, **kwargs)
826 # We can only query directly if given a non-component DatasetType
827 # instance. If we were given an expression or str or a component
828 # DatasetType instance, we'll populate this dict, recurse, and return.
829 # If we already have a non-component DatasetType, it will remain None
830 # and we'll run the query directly.
831 composition: Optional[
832 Dict[
833 DatasetType, # parent dataset type
834 List[Optional[str]] # component name, or None for parent
835 ]
836 ] = None
837 if not isinstance(datasetType, DatasetType):
838 # We were given a dataset type expression (which may be as simple
839 # as a str). Loop over all matching datasets, delegating handling
840 # of the `components` argument to queryDatasetTypes, as we populate
841 # the composition dict.
842 composition = defaultdict(list)
843 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
844 parentName, componentName = trueDatasetType.nameAndComponent()
845 if componentName is not None:
846 parentDatasetType = self.getDatasetType(parentName)
847 composition.setdefault(parentDatasetType, []).append(componentName)
848 else:
849 composition.setdefault(trueDatasetType, []).append(None)
850 elif datasetType.isComponent():
851 # We were given a true DatasetType instance, but it's a component.
852 # the composition dict will have exactly one item.
853 parentName, componentName = datasetType.nameAndComponent()
854 parentDatasetType = self.getDatasetType(parentName)
855 composition = {parentDatasetType: [componentName]}
856 if composition is not None:
857 # We need to recurse. Do that once for each parent dataset type.
858 chain = []
859 for parentDatasetType, componentNames in composition.items():
860 parentResults = self.queryDatasets(
861 parentDatasetType,
862 collections=collections,
863 dimensions=dimensions,
864 dataId=standardizedDataId,
865 where=where,
866 findFirst=findFirst,
867 check=check,
868 )
869 if isinstance(parentResults, queries.ParentDatasetQueryResults):
870 chain.append(
871 parentResults.withComponents(componentNames)
872 )
873 else:
874 # Should only happen if we know there would be no results.
875 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
876 and not parentResults._chain
877 return queries.ChainedDatasetQueryResults(chain)
878 # If we get here, there's no need to recurse (or we are already
879 # recursing; there can only ever be one level of recursion).
881 # The full set of dimensions in the query is the combination of those
882 # needed for the DatasetType and those explicitly requested, if any.
883 requestedDimensionNames = set(datasetType.dimensions.names)
884 if dimensions is not None:
885 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
886 # Construct the summary structure needed to construct a QueryBuilder.
887 summary = queries.QuerySummary(
888 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
889 dataId=standardizedDataId,
890 expression=where,
891 bind=bind,
892 defaults=self.defaults.dataId,
893 check=check,
894 )
895 builder = self.makeQueryBuilder(summary)
896 # Add the dataset subquery to the query, telling the QueryBuilder to
897 # include the rank of the selected collection in the results only if we
898 # need to findFirst. Note that if any of the collections are
899 # actually wildcard expressions, and we've asked for deduplication,
900 # this will raise TypeError for us.
901 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
902 return queries.ChainedDatasetQueryResults(())
903 query = builder.finish()
904 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
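# Usage sketch (added; not part of the original source). The dataset type,
# collection, and expression below are hypothetical; the ``where`` string
# follows the registry query expression language:
#
#     refs = registry.queryDatasets("calexp",
#                                   collections=["HSC/runs/example"],
#                                   where="instrument = 'HSC' AND visit = 903334")
#     for ref in refs:
#         print(ref.dataId, ref.run)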
906 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
907 dataId: Optional[DataId] = None,
908 datasets: Any = None,
909 collections: Any = None,
910 where: Optional[str] = None,
911 components: Optional[bool] = None,
912 bind: Optional[Mapping[str, Any]] = None,
913 check: bool = True,
914 **kwargs: Any) -> queries.DataCoordinateQueryResults:
915 # Docstring inherited from lsst.daf.butler.registry.Registry
916 dimensions = iterable(dimensions)
917 standardizedDataId = self.expandDataId(dataId, **kwargs)
918 standardizedDatasetTypes = set()
919 requestedDimensions = self.dimensions.extract(dimensions)
920 queryDimensionNames = set(requestedDimensions.names)
921 if datasets is not None:
922 if collections is None:
923 if not self.defaults.collections:
924 raise TypeError("Cannot pass 'datasets' without 'collections'.")
925 collections = self.defaults.collections
926 else:
927 # Preprocess collections expression in case the original
928 # included single-pass iterators (we'll want to use it multiple
929 # times below).
930 collections = CollectionQuery.fromExpression(collections)
931 for datasetType in self.queryDatasetTypes(datasets, components=components):
932 queryDimensionNames.update(datasetType.dimensions.names)
933 # If any matched dataset type is a component, just operate on
934 # its parent instead, because Registry doesn't know anything
935 # about what components exist, and here (unlike queryDatasets)
936 # we don't care about returning them.
937 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
938 if componentName is not None:
939 datasetType = self.getDatasetType(parentDatasetTypeName)
940 standardizedDatasetTypes.add(datasetType)
942 summary = queries.QuerySummary(
943 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
944 dataId=standardizedDataId,
945 expression=where,
946 bind=bind,
947 defaults=self.defaults.dataId,
948 check=check,
949 )
950 builder = self.makeQueryBuilder(summary)
951 for datasetType in standardizedDatasetTypes:
952 builder.joinDataset(datasetType, collections, isResult=False)
953 query = builder.finish()
954 return queries.DataCoordinateQueryResults(self._db, query)
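# Usage sketch (added; not part of the original source). The dimensions,
# dataset type, and collection are hypothetical:
#
#     dataIds = registry.queryDataIds(["visit", "detector"],
#                                     datasets="raw",
#                                     collections=["HSC/raw/all"],
#                                     instrument="HSC")
#     for dataId in dataIds:
#         print(dataId["visit"], dataId["detector"])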
956 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
957 dataId: Optional[DataId] = None,
958 datasets: Any = None,
959 collections: Any = None,
960 where: Optional[str] = None,
961 components: Optional[bool] = None,
962 bind: Optional[Mapping[str, Any]] = None,
963 check: bool = True,
964 **kwargs: Any) -> Iterator[DimensionRecord]:
965 # Docstring inherited from lsst.daf.butler.registry.Registry
966 if not isinstance(element, DimensionElement):
967 try:
968 element = self.dimensions[element]
969 except KeyError as e:
970 raise KeyError(f"No such dimension '{element}', available dimensions: "
971 + str(self.dimensions.getStaticElements())) from e
972 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
973 where=where, components=components, bind=bind, check=check, **kwargs)
974 return iter(self._managers.dimensions[element].fetch(dataIds))
976 def queryDatasetAssociations(
977 self,
978 datasetType: Union[str, DatasetType],
979 collections: Any = ...,
980 *,
981 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
982 flattenChains: bool = False,
983 ) -> Iterator[DatasetAssociation]:
984 # Docstring inherited from lsst.daf.butler.registry.Registry
985 if collections is None:
986 if not self.defaults.collections:
987 raise TypeError("No collections provided to queryDatasetAssociations, "
988 "and no defaults from registry construction.")
989 collections = self.defaults.collections
990 else:
991 collections = CollectionQuery.fromExpression(collections)
992 TimespanReprClass = self._db.getTimespanRepresentation()
993 if isinstance(datasetType, str):
994 storage = self._managers.datasets[datasetType]
995 else:
996 storage = self._managers.datasets[datasetType.name]
997 for collectionRecord in collections.iter(self._managers.collections,
998 collectionTypes=frozenset(collectionTypes),
999 flattenChains=flattenChains):
1000 query = storage.select(collectionRecord)
1001 if query is None:
1002 continue
1003 for row in self._db.query(query.combine()):
1004 dataId = DataCoordinate.fromRequiredValues(
1005 storage.datasetType.dimensions,
1006 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1007 )
1008 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
1009 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1010 conform=False)
1011 if collectionRecord.type is CollectionType.CALIBRATION:
1012 timespan = TimespanReprClass.extract(row)
1013 else:
1014 timespan = None
1015 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
1017 storageClasses: StorageClassFactory
1018 """All storage classes known to the registry (`StorageClassFactory`).
1019 """