Coverage for python/lsst/daf/butler/registry/_sqlRegistry.py : 14%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "SqlRegistry",
)

from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

from ..core import (
    ButlerURI,
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
)
from . import queries
from ..core.utils import iterable, transactional
from ._config import RegistryConfig
from ._collectionType import CollectionType
from ._defaults import RegistryDefaults
from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
from .managers import RegistryManagerTypes, RegistryManagerInstances
from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
from .summaries import CollectionSummary
from .interfaces import ChainedCollectionRecord, RunRecord
from ._registry import Registry

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from .interfaces import (
        CollectionRecord,
        Database,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
                         dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
                         butlerRoot: Optional[str] = None) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, the default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, the default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : `str`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
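
        Examples
        --------
        A minimal sketch of creating a new, empty repository database.  The
        in-memory SQLite connection string is illustrative, and this assumes
        the connection string is held under the ``db`` key of the registry
        configuration:

        >>> from lsst.daf.butler.registry import RegistryConfig
        >>> config = RegistryConfig()
        >>> config["db"] = "sqlite://"  # illustrative in-memory database
        >>> registry = SqlRegistry.createFromConfig(config)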
135 """
136 config = cls.forceRegistryConfig(config)
137 config.replaceRoot(butlerRoot)
139 if isinstance(dimensionConfig, str):
140 dimensionConfig = DimensionConfig(config)
141 elif dimensionConfig is None:
142 dimensionConfig = DimensionConfig()
143 elif not isinstance(dimensionConfig, DimensionConfig):
144 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")
146 DatabaseClass = config.getDatabaseClass()
147 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
148 namespace=config.get("namespace"))
149 managerTypes = RegistryManagerTypes.fromConfig(config)
150 managers = managerTypes.makeRepo(database, dimensionConfig)
151 return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
                   butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
                   defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create `Registry` subclass instance from `config`.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `str` or `ButlerURI`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
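
        Examples
        --------
        A minimal sketch of connecting to an existing repository read-only;
        the configuration path below is illustrative:

        >>> registry = SqlRegistry.fromConfig("/path/to/repo/butler.yaml",
        ...                                   writeable=False)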
177 """
178 config = cls.forceRegistryConfig(config)
179 config.replaceRoot(butlerRoot)
180 DatabaseClass = config.getDatabaseClass()
181 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
182 namespace=config.get("namespace"), writeable=writeable)
183 managerTypes = RegistryManagerTypes.fromConfig(config)
184 managers = managerTypes.loadRepo(database)
185 if defaults is None:
186 defaults = RegistryDefaults()
187 return cls(database, defaults, managers)

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use the registry across a fork boundary, make sure
        that there are no currently active connections (no session or
        transaction in progress) and that the connection pool is reset with
        this method. It should be called by the child process immediately
        after the fork.
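
        Examples
        --------
        A sketch of the intended call pattern; the ``worker`` function and the
        use of a fork start method are illustrative, not part of this API:

        >>> import multiprocessing
        >>> def worker(registry):
        ...     # In the child process, drop connections inherited from the
        ...     # parent before touching the database.
        ...     registry.resetConnectionPool()
        ...     registry.refresh()
        >>> multiprocessing.Process(target=worker, args=(registry,)).start()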
246 """
247 self._db._engine.dispose()

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
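
        Examples
        --------
        A minimal sketch; the table name and column definitions are purely
        illustrative, and the exact `ddl.FieldSpec` arguments may differ
        between versions:

        >>> import sqlalchemy
        >>> from lsst.daf.butler import ddl
        >>> spec = ddl.TableSpec(
        ...     fields=[
        ...         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger,
        ...                       primaryKey=True),
        ...         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
        ...     ]
        ... )
        >>> registry.registerOpaqueTable("datastore_files", spec)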
263 """
264 self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
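
        Examples
        --------
        A minimal sketch, assuming a table like the one registered in the
        `registerOpaqueTable` example above (names and values are
        illustrative):

        >>> registry.insertOpaqueData(
        ...     "datastore_files",
        ...     {"dataset_id": 1, "path": "a/b/c.fits"},
        ...     {"dataset_id": 2, "path": "a/b/d.fits"},
        ... )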
278 """
279 self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
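
        Examples
        --------
        A minimal sketch, continuing the illustrative table from the
        `registerOpaqueTable` example; each keyword argument becomes an
        equality constraint:

        >>> for row in registry.fetchOpaqueData("datastore_files",
        ...                                     dataset_id=1):
        ...     print(row["path"])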
299 """
300 yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
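
        Examples
        --------
        A minimal sketch (table and column names are illustrative):

        >>> registry.deleteOpaqueData("datastore_files", dataset_id=2)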
316 """
317 self._managers.opaque[tableName].delete(where.keys(), where)

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
                           doc: Optional[str] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.register(name, type, doc=doc)

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.register(name, CollectionType.RUN, doc=doc)

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any = None, timespan: Optional[Timespan] = None,
                    **kwargs: Any) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, defaults=self.defaults.dataId,
                                            **kwargs)
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to findDataset, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if (collectionRecord.type is CollectionType.CALIBRATION
                    and (not storage.datasetType.isCalibration() or timespan is None)):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: Optional[str] = None, expand: bool = True) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to insertDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                               for dataId in progress.wrap(dataIds,
                                                           f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
                               for dataId in dataIds]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: int) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Removing datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Associating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Disassociating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Certifying datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataId]] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
                                   for d in dataIds]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     withDefaults: bool = True,
                     **kwargs: Any) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
                                                  defaults=defaults, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record)

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
                          ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
                                     "if it has components they will not be included in query results.")
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            if storage is not None:
                done.add(storage.datasetType.name)
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
                                 "if it has components they will not be included in query results.")
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetTypes argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
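
        Examples
        --------
        A minimal sketch mirroring how this registry builds its own queries
        internally; the dimension names are illustrative:

        >>> summary = queries.QuerySummary(
        ...     requested=DimensionGraph(registry.dimensions,
        ...                              names=["instrument", "visit"]),
        ... )
        >>> builder = registry.makeQueryBuilder(summary)
        >>> query = builder.finish()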
729 """
730 return queries.QueryBuilder(
731 summary,
732 queries.RegistryManagers(
733 collections=self._managers.collections,
734 dimensions=self._managers.dimensions,
735 datasets=self._managers.datasets,
736 TimespanReprClass=self._db.getTimespanRepresentation(),
737 ),
738 )
740 def queryDatasets(self, datasetType: Any, *,
741 collections: Any = None,
742 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
743 dataId: Optional[DataId] = None,
744 where: Optional[str] = None,
745 findFirst: bool = False,
746 components: Optional[bool] = None,
747 bind: Optional[Mapping[str, Any]] = None,
748 check: bool = True,
749 **kwargs: Any) -> queries.DatasetQueryResults:
750 # Docstring inherited from lsst.daf.butler.registry.Registry
752 # Standardize the collections expression.
753 if collections is None:
754 if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasets, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]]  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # the composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    findFirst=findFirst,
                    check=check,
                )
                if isinstance(parentResults, queries.ParentDatasetQueryResults):
                    chain.append(
                        parentResults.withComponents(componentNames)
                    )
                else:
                    # Should only happen if we know there would be no results.
                    assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
                        and not parentResults._chain
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self.makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
            return queries.ChainedDatasetQueryResults(())
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None])

    def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                     dataId: Optional[DataId] = None,
                     datasets: Any = None,
                     collections: Any = None,
                     where: Optional[str] = None,
                     components: Optional[bool] = None,
                     bind: Optional[Mapping[str, Any]] = None,
                     check: bool = True,
                     **kwargs: Any) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        queryDimensionNames = set(requestedDimensions.names)
        if datasets is not None:
            if collections is None:
                if not self.defaults.collections:
                    raise TypeError("Cannot pass 'datasets' without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components):
                queryDimensionNames.update(datasetType.dimensions.names)
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)

        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self.makeQueryBuilder(summary)
        for datasetType in standardizedDatasetTypes:
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        return queries.DataCoordinateQueryResults(self._db, query)

    def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
                              dataId: Optional[DataId] = None,
                              datasets: Any = None,
                              collections: Any = None,
                              where: Optional[str] = None,
                              components: Optional[bool] = None,
                              bind: Optional[Mapping[str, Any]] = None,
                              check: bool = True,
                              **kwargs: Any) -> Iterator[DimensionRecord]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(f"No such dimension '{element}', available dimensions: "
                               + str(self.dimensions.getStaticElements())) from e
        dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
                                    where=where, components=components, bind=bind, check=check, **kwargs)
        return iter(self._managers.dimensions[element].fetch(dataIds))

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasetAssociations, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(self._managers.collections,
                                                 collectionTypes=frozenset(collectionTypes),
                                                 flattenChains=flattenChains):
            query = storage.select(collectionRecord)
            if query is None:
                continue
            for row in self._db.query(query.combine()):
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names)
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
                                 conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """