Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Dict,
36 Iterable,
37 Iterator,
38 List,
39 Mapping,
40 Optional,
41 Set,
42 Type,
43 TYPE_CHECKING,
44 Union,
45)
47import astropy.time
48import sqlalchemy
50import lsst.sphgeom
51from ..core import (
52 Config,
53 DataCoordinate,
54 DataId,
55 DatasetRef,
56 DatasetType,
57 ddl,
58 Dimension,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 ExpandedDataCoordinate,
64 NamedKeyDict,
65 Timespan,
66 StorageClassFactory,
67)
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from .queries import (
71 QueryBuilder,
72 QuerySummary,
73)
74from .tables import makeRegistryTableSpecs
75from ._collectionType import CollectionType
76from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
77from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
78from .interfaces import ChainedCollectionRecord, RunRecord
80if TYPE_CHECKING:  # coverage annotation: 80 ↛ 81, branch never taken (TYPE_CHECKING is false at runtime)
81 from ..butlerConfig import ButlerConfig
82 from ..core import (
83 Quantum
84 )
85 from .interfaces import (
86 CollectionManager,
87 Database,
88 OpaqueTableStorageManager,
89 DimensionRecordStorageManager,
90 DatasetRecordStorageManager,
91 DatastoreRegistryBridgeManager,
92 )
95@dataclass
96class ConsistentDataIds:
97 """A struct used to report relationships between data IDs by
98 `Registry.relateDataIds`.
100 If an instance of this class is returned (instead of `None`), the data IDs
101 are "not inconsistent" - any keys they have in common have the same value,
102 and any spatial or temporal relationships they have at least might involve
103 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
104 to `True` in boolean contexts.
105 """
107 overlaps: bool
108 """If `True`, the data IDs have at least one key in common, associated with
109 the same value.
111 Note that data IDs are not inconsistent even if overlaps is `False` - they
112 may simply have no keys in common, which means they cannot have
113 inconsistent values for any keys. They may even be equal, in the case that
114 both data IDs are empty.
116 This field does _not_ indicate whether a spatial or temporal overlap
117 relationship exists.
118 """
120 contains: bool
121 """If `True`, all keys in the first data ID are in the second, and are
122 associated with the same values.
124 This includes the case where the first data ID is empty.
125 """
127 within: bool
128 """If `True`, all keys in the second data ID are in the first, and are
129 associated with the same values.
131 This includes the case where the second data ID is empty.
132 """
134 @property
135 def equal(self) -> bool:
136 """If `True`, the two data IDs are the same.
138 Data IDs are equal if they have both a `contains` and a `within`
139 relationship.
140 """
141 return self.contains and self.within
143 @property
144 def disjoint(self) -> bool:
145 """If `True`, the two data IDs have no keys in common.
147 This is simply the opposite of `overlaps`. Disjoint data IDs are by
148 definition not inconsistent.
149 """
150 return not self.overlaps
152 def __bool__(self) -> bool:
153 return True
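# Illustrative sketch (values are hypothetical, not taken from this module):
# how the truthiness contract of ConsistentDataIds is meant to be read.
#
#     relationship = ConsistentDataIds(overlaps=True, contains=True, within=False)
#     assert bool(relationship)         # any returned instance is "not inconsistent"
#     assert not relationship.equal     # equal requires both contains and within
#     assert not relationship.disjoint  # disjoint is simply `not overlaps`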
156class Registry:
157 """Registry interface.
159 Parameters
160 ----------
161 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
162 Registry configuration
163 """
165 defaultConfigFile = None
166 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
167 absolute path. Can be None if no defaults specified.
168 """
170 @classmethod
171 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
172 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
173 """Create `Registry` subclass instance from `config`.
175 Uses ``registry.cls`` from `config` to determine which subclass to
176 instantiate.
178 Parameters
179 ----------
180 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
181 Registry configuration
182 create : `bool`, optional
183 Assume empty Registry and create a new one.
184 butlerRoot : `str`, optional
185 Path to the repository root this `Registry` will manage.
186 writeable : `bool`, optional
187 If `True` (default) create a read-write connection to the database.
189 Returns
190 -------
191 registry : `Registry` (subclass)
192 A new `Registry` subclass instance.
193 """
194 if not isinstance(config, RegistryConfig):
195 if isinstance(config, str) or isinstance(config, Config):
196 config = RegistryConfig(config)
197 else:
198 raise ValueError("Incompatible Registry configuration: {}".format(config))
199 config.replaceRoot(butlerRoot)
200 DatabaseClass = config.getDatabaseClass()
201 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
202 namespace=config.get("namespace"), writeable=writeable)
203 universe = DimensionUniverse(config)
204 opaque = doImport(config["managers", "opaque"])
205 dimensions = doImport(config["managers", "dimensions"])
206 collections = doImport(config["managers", "collections"])
207 datasets = doImport(config["managers", "datasets"])
208 datastoreBridges = doImport(config["managers", "datastores"])
209 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
210 datasets=datasets, datastoreBridges=datastoreBridges, create=create)
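    # Usage sketch for `fromConfig` (the config path and repository root shown
    # here are hypothetical):
    #
    #     config = RegistryConfig("/path/to/repo/registry.yaml")
    #     registry = Registry.fromConfig(config, butlerRoot="/path/to/repo",
    #                                    writeable=False)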
212 def __init__(self, database: Database, universe: DimensionUniverse, *,
213 opaque: Type[OpaqueTableStorageManager],
214 dimensions: Type[DimensionRecordStorageManager],
215 collections: Type[CollectionManager],
216 datasets: Type[DatasetRecordStorageManager],
217 datastoreBridges: Type[DatastoreRegistryBridgeManager],
218 create: bool = False):
219 self._db = database
220 self.storageClasses = StorageClassFactory()
221 with self._db.declareStaticTables(create=create) as context:
222 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
223 self._collections = collections.initialize(self._db, context)
224 self._datasets = datasets.initialize(self._db, context,
225 collections=self._collections,
226 universe=self.dimensions)
227 self._opaque = opaque.initialize(self._db, context)
228 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
229 opaque=self._opaque,
230 datasets=datasets,
231 universe=self.dimensions)
232 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
233 self._collections,
234 self._datasets))
235 self._collections.refresh()
236 self._datasets.refresh(universe=self._dimensions.universe)
238 def __str__(self) -> str:
239 return str(self._db)
241 def __repr__(self) -> str:
242 return f"Registry({self._db!r}, {self.dimensions!r})"
244 def isWriteable(self) -> bool:
245 """Return `True` if this registry allows write operations, and `False`
246 otherwise.
247 """
248 return self._db.isWriteable()
250 @property
251 def dimensions(self) -> DimensionUniverse:
252 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
253 """
254 return self._dimensions.universe
256 @contextlib.contextmanager
257 def transaction(self) -> Iterator[None]:
258 """Return a context manager that represents a transaction.
259 """
260 # TODO make savepoint=False the default.
261 try:
262 with self._db.transaction():
263 yield
264 except BaseException:
265 # TODO: this clears the caches sometimes when we wouldn't actually
266 # need to. Can we avoid that?
267 self._dimensions.clearCaches()
268 raise
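    # Usage sketch for `transaction` (assumes an existing `registry`, tagged
    # collections, and resolved `refs`; all names are hypothetical): operations
    # inside the block commit or roll back together.
    #
    #     with registry.transaction():
    #         registry.associate("tagged/a", refs)
    #         registry.disassociate("tagged/b", refs)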
270 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
271 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
272 other data repository client.
274 Opaque table records can be added via `insertOpaqueData`, retrieved via
275 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
277 Parameters
278 ----------
279 tableName : `str`
280 Logical name of the opaque table. This may differ from the
281 actual name used in the database by a prefix and/or suffix.
282 spec : `ddl.TableSpec`
283 Specification for the table to be added.
284 """
285 self._opaque.register(tableName, spec)
287 @transactional
288 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
289 """Insert records into an opaque table.
291 Parameters
292 ----------
293 tableName : `str`
294 Logical name of the opaque table. Must match the name used in a
295 previous call to `registerOpaqueTable`.
296 data
297 Each additional positional argument is a dictionary that represents
298 a single row to be added.
299 """
300 self._opaque[tableName].insert(*data)
302 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
303 """Retrieve records from an opaque table.
305 Parameters
306 ----------
307 tableName : `str`
308 Logical name of the opaque table. Must match the name used in a
309 previous call to `registerOpaqueTable`.
310 where
311 Additional keyword arguments are interpreted as equality
312 constraints that restrict the returned rows (combined with AND);
313 keyword arguments are column names and values are the values they
314 must have.
316 Yields
317 ------
318 row : `dict`
319 A dictionary representing a single result row.
320 """
321 yield from self._opaque[tableName].fetch(**where)
323 @transactional
324 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
325 """Remove records from an opaque table.
327 Parameters
328 ----------
329 tableName : `str`
330 Logical name of the opaque table. Must match the name used in a
331 previous call to `registerOpaqueTable`.
332 where
333 Additional keyword arguments are interpreted as equality
334 constraints that restrict the deleted rows (combined with AND);
335 keyword arguments are column names and values are the values they
336 must have.
337 """
338 self._opaque[tableName].delete(**where)
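    # Usage sketch for the opaque-table methods (assumes a `ddl.TableSpec`
    # instance `spec` built elsewhere; the table name and columns are
    # hypothetical):
    #
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)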
340 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
341 """Add a new collection if one with the given name does not exist.
343 Parameters
344 ----------
345 name : `str`
346 The name of the collection to create.
347 type : `CollectionType`
348 Enum value indicating the type of collection to create.
350 Notes
351 -----
352 This method cannot be called within transactions, as it needs to be
353 able to perform its own transaction to be concurrent.
354 """
355 self._collections.register(name, type)
357 def getCollectionType(self, name: str) -> CollectionType:
358 """Return an enumeration value indicating the type of the given
359 collection.
361 Parameters
362 ----------
363 name : `str`
364 The name of the collection.
366 Returns
367 -------
368 type : `CollectionType`
369 Enum value indicating the type of this collection.
371 Raises
372 ------
373 MissingCollectionError
374 Raised if no collection with the given name exists.
375 """
376 return self._collections.find(name).type
378 def registerRun(self, name: str) -> None:
379 """Add a new run if one with the given name does not exist.
381 Parameters
382 ----------
383 name : `str`
384 The name of the run to create.
386 Notes
387 -----
388 This method cannot be called within transactions, as it needs to be
389 able to perform its own transaction to be concurrent.
390 """
391 self._collections.register(name, CollectionType.RUN)
393 @transactional
394 def removeCollection(self, name: str) -> None:
395 """Completely remove the given collection.
397 Parameters
398 ----------
399 name : `str`
400 The name of the collection to remove.
402 Raises
403 ------
404 MissingCollectionError
405 Raised if no collection with the given name exists.
407 Notes
408 -----
409 If this is a `~CollectionType.RUN` collection, all datasets and quanta
410 in it are also fully removed. This requires that those datasets be
411 removed (or at least trashed) from any datastores that hold them first.
413 A collection may not be deleted as long as it is referenced by a
414 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
415 be deleted or redefined first.
416 """
417 self._collections.remove(name)
419 def getCollectionChain(self, parent: str) -> CollectionSearch:
420 """Return the child collections in a `~CollectionType.CHAINED`
421 collection.
423 Parameters
424 ----------
425 parent : `str`
426 Name of the chained collection. Must have already been added via
427 a call to `Registry.registerCollection`.
429 Returns
430 -------
431 children : `CollectionSearch`
432 An object that defines the search path of the collection.
433 See :ref:`daf_butler_collection_expressions` for more information.
435 Raises
436 ------
437 MissingCollectionError
438 Raised if ``parent`` does not exist in the `Registry`.
439 TypeError
440 Raised if ``parent`` does not correspond to a
441 `~CollectionType.CHAINED` collection.
442 """
443 record = self._collections.find(parent)
444 if record.type is not CollectionType.CHAINED:
445 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
446 assert isinstance(record, ChainedCollectionRecord)
447 return record.children
449 @transactional
450 def setCollectionChain(self, parent: str, children: Any) -> None:
451 """Define or redefine a `~CollectionType.CHAINED` collection.
453 Parameters
454 ----------
455 parent : `str`
456 Name of the chained collection. Must have already been added via
457 a call to `Registry.registerCollection`.
458 children : `Any`
459 An expression defining an ordered search of child collections,
460 generally an iterable of `str`. Restrictions on the dataset types
461 to be searched can also be included, by passing a mapping or an
462 iterable containing tuples; see
463 :ref:`daf_butler_collection_expressions` for more information.
465 Raises
466 ------
467 MissingCollectionError
468 Raised when any of the given collections do not exist in the
469 `Registry`.
470 TypeError
471 Raised if ``parent`` does not correspond to a
472 `~CollectionType.CHAINED` collection.
473 ValueError
474 Raised if the given collections contain a cycle.
475 """
476 record = self._collections.find(parent)
477 if record.type is not CollectionType.CHAINED:
478 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
479 assert isinstance(record, ChainedCollectionRecord)
480 children = CollectionSearch.fromExpression(children)
481 if children != record.children:
482 record.update(self._collections, children)
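    # Usage sketch for collection registration and chaining (all collection
    # names are hypothetical):
    #
    #     registry.registerRun("raw/run1")
    #     registry.registerCollection("calib", CollectionType.TAGGED)
    #     registry.registerCollection("defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("defaults", ["raw/run1", "calib"])
    #     children = registry.getCollectionChain("defaults")  # ordered search path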
484 def registerDatasetType(self, datasetType: DatasetType) -> bool:
485 """
486 Add a new `DatasetType` to the Registry.
488 It is not an error to register the same `DatasetType` twice.
490 Parameters
491 ----------
492 datasetType : `DatasetType`
493 The `DatasetType` to be added.
495 Returns
496 -------
497 inserted : `bool`
498 `True` if ``datasetType`` was inserted, `False` if an identical
499 existing `DatasetType` was found. Note that in either case the
500 DatasetType is guaranteed to be defined in the Registry
501 consistently with the given definition.
503 Raises
504 ------
505 ValueError
506 Raised if the dimensions or storage class are invalid.
507 ConflictingDefinitionError
508 Raised if this DatasetType is already registered with a different
509 definition.
511 Notes
512 -----
513 This method cannot be called within transactions, as it needs to be
514 able to perform its own transaction to be concurrent.
515 """
516 _, inserted = self._datasets.register(datasetType)
517 return inserted
519 def getDatasetType(self, name: str) -> DatasetType:
520 """Get the `DatasetType`.
522 Parameters
523 ----------
524 name : `str`
525 Name of the type.
527 Returns
528 -------
529 type : `DatasetType`
530 The `DatasetType` associated with the given name.
532 Raises
533 ------
534 KeyError
535 Raised if the requested named DatasetType could not be found in the
536 registry.
536 """
537 storage = self._datasets.find(name)
538 if storage is None:
539 raise KeyError(f"DatasetType '{name}' could not be found.")
540 return storage.datasetType
542 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
543 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
544 """Find a dataset given its `DatasetType` and data ID.
546 This can be used to obtain a `DatasetRef` that permits the dataset to
547 be read from a `Datastore`. If the dataset is a component and can not
548 be found using the provided dataset type, a dataset ref for the parent
549 will be returned instead but with the correct dataset type.
551 Parameters
552 ----------
553 datasetType : `DatasetType` or `str`
554 A `DatasetType` or the name of one.
555 dataId : `dict` or `DataCoordinate`, optional
556 A `dict`-like object containing the `Dimension` links that identify
557 the dataset within a collection.
558 collections
559 An expression that fully or partially identifies the collections
560 to search for the dataset, such as a `str`, `re.Pattern`, or
561 iterable thereof. `...` can be used to return all collections.
562 See :ref:`daf_butler_collection_expressions` for more information.
563 **kwargs
564 Additional keyword arguments passed to
565 `DataCoordinate.standardize` to convert ``dataId`` to a true
566 `DataCoordinate` or augment an existing one.
568 Returns
569 -------
570 ref : `DatasetRef`
571 A reference to the dataset, or `None` if no matching Dataset
572 was found.
574 Raises
575 ------
576 LookupError
577 Raised if one or more data ID keys are missing or the dataset type
578 does not exist.
579 MissingCollectionError
580 Raised if any of ``collections`` does not exist in the registry.
581 """
582 if isinstance(datasetType, DatasetType):
583 storage = self._datasets.find(datasetType.name)
584 if storage is None:
585 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
586 else:
587 storage = self._datasets.find(datasetType)
588 if storage is None:
589 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
590 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
591 universe=self.dimensions, **kwargs)
592 collections = CollectionSearch.fromExpression(collections)
593 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
594 result = storage.find(collectionRecord, dataId)
595 if result is not None:
596 return result
598 # fallback to the parent if we got nothing and this was a component
599 if storage.datasetType.isComponent():
600 parentType, _ = storage.datasetType.nameAndComponent()
601 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
602 if parentRef is not None:
603 # Should already conform and we know no components
604 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
605 run=parentRef.run, conform=False, hasParentId=True)
607 return None
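    # Usage sketch for `findDataset` (the dataset type, data ID keys, and
    # collection names are hypothetical):
    #
    #     ref = registry.findDataset("raw", instrument="HypoCam", exposure=123,
    #                                detector=4, collections=["raw/run1"])
    #     if ref is None:
    #         print("no matching dataset in the given collections")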
609 @transactional
610 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
611 run: str, *, producer: Optional[Quantum] = None) -> List[DatasetRef]:
612 """Insert one or more datasets into the `Registry`
614 This always adds new datasets; to associate existing datasets with
615 a new collection, use ``associate``.
617 Parameters
618 ----------
619 datasetType : `DatasetType` or `str`
620 A `DatasetType` or the name of one.
621 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
622 Dimension-based identifiers for the new datasets.
623 run : `str`
624 The name of the run that produced the datasets.
625 producer : `Quantum`
626 Unit of work that produced the datasets. May be `None` to store
627 no provenance information, but if present the `Quantum` must
628 already have been added to the Registry.
630 Returns
631 -------
632 refs : `list` of `DatasetRef`
633 Resolved `DatasetRef` instances for all given data IDs (in the same
634 order).
636 Raises
637 ------
638 ConflictingDefinitionError
639 If a dataset with the same dataset type and data ID as one of those
640 given already exists in ``run``.
641 MissingCollectionError
642 Raised if ``run`` does not exist in the registry.
643 """
644 if isinstance(datasetType, DatasetType):
645 storage = self._datasets.find(datasetType.name)
646 if storage is None:
647 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
648 else:
649 storage = self._datasets.find(datasetType)
650 if storage is None:
651 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
652 runRecord = self._collections.find(run)
653 if runRecord.type is not CollectionType.RUN:
654 raise TypeError("Given collection is of type {runRecord.type.name}; RUN collection required.")
655 assert isinstance(runRecord, RunRecord)
656 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
657 for dataId in dataIds]
658 try:
659 refs = list(storage.insert(runRecord, expandedDataIds, quantum=producer))
660 except sqlalchemy.exc.IntegrityError as err:
661 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
662 f"one or more datasets of type {storage.datasetType} into "
663 f"collection '{run}'. "
664 f"This probably means a dataset with the same data ID "
665 f"and dataset type already exists, but it may also mean a "
666 f"dimension row is missing.") from err
667 return refs
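    # Usage sketch for `insertDatasets` (assumes a dataset type named "raw" has
    # already been registered; the run name and data ID values are
    # hypothetical):
    #
    #     registry.registerRun("raw/run1")
    #     refs = registry.insertDatasets(
    #         "raw",
    #         dataIds=[{"instrument": "HypoCam", "exposure": 123, "detector": 4}],
    #         run="raw/run1",
    #     )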
669 def getDataset(self, id: int) -> Optional[DatasetRef]:
670 """Retrieve a Dataset entry.
672 Parameters
673 ----------
674 id : `int`
675 The unique identifier for the dataset.
677 Returns
678 -------
679 ref : `DatasetRef` or `None`
680 A ref to the Dataset, or `None` if no matching Dataset
681 was found.
682 """
683 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
684 if ref is None:
685 return None
686 return ref
688 @transactional
689 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
690 """Remove datasets from the Registry.
692 The datasets will be removed unconditionally from all collections, and
693 any `Quantum` that consumed this dataset will instead be marked with
694 having a NULL input. `Datastore` records will *not* be deleted; the
695 caller is responsible for ensuring that the dataset has already been
696 removed from all Datastores.
698 Parameters
699 ----------
700 refs : `Iterable` of `DatasetRef`
701 References to the datasets to be removed. Must include a valid
702 ``id`` attribute, and should be considered invalidated upon return.
704 Raises
705 ------
706 AmbiguousDatasetError
707 Raised if any ``ref.id`` is `None`.
708 OrphanedRecordError
709 Raised if any dataset is still present in any `Datastore`.
710 """
711 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
712 storage = self._datasets.find(datasetType.name)
713 assert storage is not None
714 try:
715 storage.delete(refsForType)
716 except sqlalchemy.exc.IntegrityError as err:
717 raise OrphanedRecordError("One or more datasets is still "
718 "present in one or more Datastores.") from err
720 @transactional
721 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
722 """Add existing datasets to a `~CollectionType.TAGGED` collection.
724 If a DatasetRef with the same exact integer ID is already in a
725 collection nothing is changed. If a `DatasetRef` with the same
726 `DatasetType` and data ID but with different integer ID
727 exists in the collection, `ConflictingDefinitionError` is raised.
729 Parameters
730 ----------
731 collection : `str`
732 Indicates the collection the datasets should be associated with.
733 refs : `Iterable` [ `DatasetRef` ]
734 An iterable of resolved `DatasetRef` instances that already exist
735 in this `Registry`.
737 Raises
738 ------
739 ConflictingDefinitionError
740 If a Dataset with the given `DatasetRef` already exists in the
741 given collection.
742 AmbiguousDatasetError
743 Raised if ``any(ref.id is None for ref in refs)``.
744 MissingCollectionError
745 Raised if ``collection`` does not exist in the registry.
746 TypeError
747 Raised if adding new datasets to the given ``collection`` is not
748 allowed.
749 """
750 collectionRecord = self._collections.find(collection)
751 if collectionRecord.type is not CollectionType.TAGGED:
752 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
753 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
754 storage = self._datasets.find(datasetType.name)
755 assert storage is not None
756 try:
757 storage.associate(collectionRecord, refsForType)
758 except sqlalchemy.exc.IntegrityError as err:
759 raise ConflictingDefinitionError(
760 f"Constraint violation while associating dataset of type {datasetType.name} with "
761 f"collection {collection}. This probably means that one or more datasets with the same "
762 f"dataset type and data ID already exist in the collection, but it may also indicate "
763 f"that the datasets do not exist."
764 ) from err
766 @transactional
767 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
768 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
770 ``collection`` and ``ref`` combinations that are not currently
771 associated are silently ignored.
773 Parameters
774 ----------
775 collection : `str`
776 The collection the datasets should no longer be associated with.
777 refs : `Iterable` [ `DatasetRef` ]
778 An iterable of resolved `DatasetRef` instances that already exist
779 in this `Registry`.
781 Raises
782 ------
783 AmbiguousDatasetError
784 Raised if any of the given dataset references is unresolved.
785 MissingCollectionError
786 Raised if ``collection`` does not exist in the registry.
787 TypeError
788 Raised if removing datasets from the given ``collection`` is not
789 allowed.
790 """
791 collectionRecord = self._collections.find(collection)
792 if collectionRecord.type is not CollectionType.TAGGED:
793 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
794 "expected TAGGED.")
795 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
796 storage = self._datasets.find(datasetType.name)
797 assert storage is not None
798 storage.disassociate(collectionRecord, refsForType)
800 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
801 """Return an object that allows a new `Datastore` instance to
802 communicate with this `Registry`.
804 Returns
805 -------
806 manager : `DatastoreRegistryBridgeManager`
807 Object that mediates communication between this `Registry` and its
808 associated datastores.
809 """
810 return self._datastoreBridges
812 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
813 """Retrieve datastore locations for a given dataset.
815 Parameters
816 ----------
817 ref : `DatasetRef`
818 A reference to the dataset for which to retrieve storage
819 information.
821 Returns
822 -------
823 datastores : `Iterable` [ `str` ]
824 All the matching datastores holding this dataset.
826 Raises
827 ------
828 AmbiguousDatasetError
829 Raised if ``ref.id`` is `None`.
830 """
831 return self._datastoreBridges.findDatastores(ref)
833 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
834 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None,
835 **kwargs: Any) -> ExpandedDataCoordinate:
836 """Expand a dimension-based data ID to include additional information.
838 Parameters
839 ----------
840 dataId : `DataCoordinate` or `dict`, optional
841 Data ID to be expanded; augmented and overridden by ``kwargs``.
842 graph : `DimensionGraph`, optional
843 Set of dimensions for the expanded ID. If `None`, the dimensions
844 will be inferred from the keys of ``dataId`` and ``kwargs``.
845 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
846 are silently ignored, providing a way to extract and expand a
847 subset of a data ID.
848 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional
849 Dimension record data to use before querying the database for that
850 data.
851 **kwargs
852 Additional keywords are treated like additional key-value pairs for
853 ``dataId``, extending and overriding it.
855 Returns
856 -------
857 expanded : `ExpandedDataCoordinate`
858 A data ID that includes full metadata for all of the dimensions it
859 identifies.
860 """
861 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
862 if isinstance(standardized, ExpandedDataCoordinate):
863 return standardized
864 elif isinstance(dataId, ExpandedDataCoordinate):
865 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
866 records.update(dataId.records)
867 else:
868 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
869 keys = dict(standardized.byName())
870 regions: List[lsst.sphgeom.ConvexPolygon] = []
871 timespans: List[Timespan[astropy.time.Time]] = []
872 for element in standardized.graph.primaryKeyTraversalOrder:
873 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
874 if record is ...:
875 storage = self._dimensions[element]
876 record = storage.fetch(keys)
877 records[element] = record
878 if record is not None:
879 for d in element.implied:
880 value = getattr(record, d.name)
881 if keys.setdefault(d.name, value) != value:
882 raise InconsistentDataIdError(
883 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
884 f"but {element.name} implies {d.name}={value!r}."
885 )
886 if element in standardized.graph.spatial and record.region is not None:
887 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
888 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
889 f"is disjoint with those for other elements.")
890 regions.append(record.region)
891 if element in standardized.graph.temporal:
892 if any(not record.timespan.overlaps(t) for t in timespans):
893 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
894 f" is disjoint with those for other elements.")
895 timespans.append(record.timespan)
896 else:
897 if element in standardized.graph.required:
898 raise LookupError(
899 f"Could not fetch record for required dimension {element.name} via keys {keys}."
900 )
901 if element.alwaysJoin:
902 raise InconsistentDataIdError(
903 f"Could not fetch record for element {element.name} via keys {keys}, ",
904 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
905 "related."
906 )
907 records.update((d, None) for d in element.implied)
908 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
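    # Usage sketch for `expandDataId` (dimension names follow the examples used
    # in the docstrings above; values are hypothetical):
    #
    #     expanded = registry.expandDataId(instrument="HypoCam", exposure=123)
    #     # `expanded.records` now carries the fetched DimensionRecords for the
    #     # exposure and its implied dimensions.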
910 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
911 """Compare the keys and values of a pair of data IDs for consistency.
913 See `ConsistentDataIds` for more information.
915 Parameters
916 ----------
917 a : `dict` or `DataCoordinate`
918 First data ID to be compared.
919 b : `dict` or `DataCoordinate`
920 Second data ID to be compared.
922 Returns
923 -------
924 relationship : `ConsistentDataIds` or `None`
925 Relationship information. This is not `None` and coerces to
926 `True` in boolean contexts if and only if the data IDs are
927 consistent in terms of all common key-value pairs, all many-to-many
928 join tables, and all spatial and temporal relationships.
929 """
930 a = DataCoordinate.standardize(a, universe=self.dimensions)
931 b = DataCoordinate.standardize(b, universe=self.dimensions)
932 aFull = getattr(a, "full", None)
933 bFull = getattr(b, "full", None)
934 aBest = aFull if aFull is not None else a
935 bBest = bFull if bFull is not None else b
936 jointKeys = aBest.keys() & bBest.keys()
937 # If any common values are not equal, we know they are inconsistent.
938 if any(aBest[k] != bBest[k] for k in jointKeys):
939 return None
940 # If the graphs are equal, we know the data IDs are.
941 if a.graph == b.graph:
942 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
943 # Result is still inconclusive. Try to expand a data ID containing
944 # keys from both; that will fail if they are inconsistent.
945 # First, if either input was already an ExpandedDataCoordinate, extract
946 # its records so we don't have to query for them.
947 records: NamedKeyDict[DimensionElement, Optional[DimensionRecord]] = NamedKeyDict()
948 if isinstance(a, ExpandedDataCoordinate):
949 records.update(a.records)
950 if isinstance(b, ExpandedDataCoordinate):
951 records.update(b.records)
952 try:
953 self.expandDataId({**a.byName(), **b.byName()}, graph=(a.graph | b.graph), records=records)
954 except InconsistentDataIdError:
955 return None
956 # We know the answer is not `None`; time to figure out what it is.
957 return ConsistentDataIds(
958 contains=(a.graph >= b.graph),
959 within=(a.graph <= b.graph),
960 overlaps=bool(a.graph & b.graph),
961 )
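    # Usage sketch for `relateDataIds` (data ID keys and values are
    # hypothetical):
    #
    #     rel = registry.relateDataIds({"instrument": "HypoCam"},
    #                                  {"instrument": "HypoCam", "detector": 4})
    #     consistent = rel is not None  # None would mean the data IDs conflict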
963 def insertDimensionData(self, element: Union[DimensionElement, str],
964 *data: Union[Mapping[str, Any], DimensionRecord],
965 conform: bool = True) -> None:
966 """Insert one or more dimension records into the database.
968 Parameters
969 ----------
970 element : `DimensionElement` or `str`
971 The `DimensionElement` or name thereof that identifies the table
972 records will be inserted into.
973 data : `dict` or `DimensionRecord` (variadic)
974 One or more records to insert.
975 conform : `bool`, optional
976 If `False` (`True` is default) perform no checking or conversions,
977 and assume that ``element`` is a `DimensionElement` instance and
978 ``data`` is one or more `DimensionRecord` instances of the
979 appropriate subclass.
980 """
981 if conform:
982 if isinstance(element, str):
983 element = self.dimensions[element]
984 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
985 for row in data]
986 else:
987 # Ignore typing since caller said to trust them with conform=False.
988 records = data # type: ignore
989 storage = self._dimensions[element] # type: ignore
990 storage.insert(*records)
992 def syncDimensionData(self, element: Union[DimensionElement, str],
993 row: Union[Mapping[str, Any], DimensionRecord],
994 conform: bool = True) -> bool:
995 """Synchronize the given dimension record with the database, inserting
996 if it does not already exist and comparing values if it does.
998 Parameters
999 ----------
1000 element : `DimensionElement` or `str`
1001 The `DimensionElement` or name thereof that identifies the table
1002 records will be inserted into.
1003 row : `dict` or `DimensionRecord`
1004 The record to insert.
1005 conform : `bool`, optional
1006 If `False` (`True` is default) perform no checking or conversions,
1007 and assume that ``element`` is a `DimensionElement` instance and
1008 ``row`` is a `DimensionRecord` instance of the appropriate
1009 subclass.
1011 Returns
1012 -------
1013 inserted : `bool`
1014 `True` if a new row was inserted, `False` otherwise.
1016 Raises
1017 ------
1018 ConflictingDefinitionError
1019 Raised if the record exists in the database (according to primary
1020 key lookup) but is inconsistent with the given one.
1022 Notes
1023 -----
1024 This method cannot be called within transactions, as it needs to be
1025 able to perform its own transaction to be concurrent.
1026 """
1027 if conform:
1028 if isinstance(element, str):
1029 element = self.dimensions[element]
1030 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
1031 else:
1032 # Ignore typing since caller said to trust them with conform=False.
1033 record = row # type: ignore
1034 storage = self._dimensions[element] # type: ignore
1035 return storage.sync(record)
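    # Usage sketch for dimension-record insertion and synchronization (the
    # record fields shown are hypothetical; the real schema comes from the
    # dimension universe configuration):
    #
    #     registry.insertDimensionData("instrument", {"name": "HypoCam",
    #                                                 "detector_max": 189})
    #     inserted = registry.syncDimensionData("instrument", {"name": "HypoCam",
    #                                                          "detector_max": 189})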
1037 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1038 ) -> Iterator[DatasetType]:
1039 """Iterate over the dataset types whose names match an expression.
1041 Parameters
1042 ----------
1043 expression : `Any`, optional
1044 An expression that fully or partially identifies the dataset types
1045 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1046 `...` can be used to return all dataset types, and is the default.
1047 See :ref:`daf_butler_dataset_type_expressions` for more
1048 information.
1049 components : `bool`, optional
1050 If `True`, apply all expression patterns to component dataset type
1051 names as well. If `False`, never apply patterns to components.
1052 If `None` (default), apply patterns to components only if their
1053 parent datasets were not matched by the expression.
1054 Fully-specified component datasets (`str` or `DatasetType`
1055 instances) are always included.
1057 Yields
1058 ------
1059 datasetType : `DatasetType`
1060 A `DatasetType` instance whose name matches ``expression``.
1061 """
1062 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1063 if wildcard is Ellipsis:
1064 for datasetType in self._datasets:
1065 if components or not datasetType.isComponent():
1066 yield datasetType
1067 return
1068 done: Set[str] = set()
1069 for name in wildcard.strings:
1070 storage = self._datasets.find(name)
1071 if storage is not None:
1072 done.add(storage.datasetType.name)
1073 yield storage.datasetType
1074 if wildcard.patterns:
1075 # If components (the argument) is None, we'll save component
1076 # datasets that we might want to match, but only if their parents
1077 # didn't get included.
1078 componentsForLater = []
1079 for datasetType in self._datasets:
1080 if datasetType.name in done:
1081 continue
1082 parentName, componentName = datasetType.nameAndComponent()
1083 if componentName is not None and not components:
1084 if components is None and parentName not in done:
1085 componentsForLater.append(datasetType)
1086 continue
1087 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1088 done.add(datasetType.name)
1089 yield datasetType
1090 # Go back and try to match saved components.
1091 for datasetType in componentsForLater:
1092 parentName, _ = datasetType.nameAndComponent()
1093 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1094 yield datasetType
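    # Usage sketch for `queryDatasetTypes` (the pattern and any matching names
    # are hypothetical):
    #
    #     import re
    #     for datasetType in registry.queryDatasetTypes(re.compile(r"calexp.*")):
    #         print(datasetType.name)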
1096 def queryCollections(self, expression: Any = ...,
1097 datasetType: Optional[DatasetType] = None,
1098 collectionType: Optional[CollectionType] = None,
1099 flattenChains: bool = False,
1100 includeChains: Optional[bool] = None) -> Iterator[str]:
1101 """Iterate over the collections whose names match an expression.
1103 Parameters
1104 ----------
1105 expression : `Any`, optional
1106 An expression that fully or partially identifies the collections
1107 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1108 `...` can be used to return all collections, and is the default.
1109 See :ref:`daf_butler_collection_expressions` for more
1110 information.
1111 datasetType : `DatasetType`, optional
1112 If provided, only yield collections that should be searched for
1113 this dataset type according to ``expression``. If this is
1114 not provided, any dataset type restrictions in ``expression`` are
1115 ignored.
1116 collectionType : `CollectionType`, optional
1117 If provided, only yield collections of this type.
1118 flattenChains : `bool`, optional
1119 If `True` (`False` is default), recursively yield the child
1120 collections of matching `~CollectionType.CHAINED` collections.
1121 includeChains : `bool`, optional
1122 If `True`, yield records for matching `~CollectionType.CHAINED`
1123 collections. Default is the opposite of ``flattenChains``: include
1124 either CHAINED collections or their children, but not both.
1126 Yields
1127 ------
1128 collection : `str`
1129 The name of a collection that matches ``expression``.
1130 """
1131 query = CollectionQuery.fromExpression(expression)
1132 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1133 flattenChains=flattenChains, includeChains=includeChains):
1134 yield record.name
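    # Usage sketch for `queryCollections` (the pattern is hypothetical):
    #
    #     import re
    #     runs = list(registry.queryCollections(re.compile(r"HypoCam/.*"),
    #                                           collectionType=CollectionType.RUN))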
1136 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1137 """Return a `QueryBuilder` instance capable of constructing and
1138 managing more complex queries than those obtainable via `Registry`
1139 interfaces.
1141 This is an advanced interface; downstream code should prefer
1142 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1143 are sufficient.
1145 Parameters
1146 ----------
1147 summary : `QuerySummary`
1148 Object describing and categorizing the full set of dimensions that
1149 will be included in the query.
1151 Returns
1152 -------
1153 builder : `QueryBuilder`
1154 Object that can be used to construct and perform advanced queries.
1155 """
1156 return QueryBuilder(summary=summary,
1157 collections=self._collections,
1158 dimensions=self._dimensions,
1159 datasets=self._datasets)
1161 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1162 dataId: Optional[DataId] = None,
1163 datasets: Any = None,
1164 collections: Any = None,
1165 where: Optional[str] = None,
1166 expand: bool = True,
1167 components: Optional[bool] = None,
1168 **kwargs: Any) -> Iterator[DataCoordinate]:
1169 """Query for and iterate over data IDs matching user-provided criteria.
1171 Parameters
1172 ----------
1173 dimensions : `Dimension` or `str`, or iterable thereof
1174 The dimensions of the data IDs to yield, as either `Dimension`
1175 instances or `str`. Will be automatically expanded to a complete
1176 `DimensionGraph`.
1177 dataId : `dict` or `DataCoordinate`, optional
1178 A data ID whose key-value pairs are used as equality constraints
1179 in the query.
1180 datasets : `Any`, optional
1181 An expression that fully or partially identifies dataset types
1182 that should constrain the yielded data IDs. For example, including
1183 "raw" here would constrain the yielded ``instrument``,
1184 ``exposure``, ``detector``, and ``physical_filter`` values to only
1185 those for which at least one "raw" dataset exists in
1186 ``collections``. Allowed types include `DatasetType`, `str`,
1187 `re.Pattern`, and iterables thereof. Unlike other dataset type
1188 expressions, `...` is not permitted - it doesn't make sense to
1189 constrain data IDs on the existence of *all* datasets.
1190 See :ref:`daf_butler_dataset_type_expressions` for more
1191 information.
1192 collections : `Any`, optional
1193 An expression that fully or partially identifies the collections
1194 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1195 thereof. `...` can be used to return all collections. Must be
1196 provided if ``datasets`` is, and is ignored if it is not. See
1197 :ref:`daf_butler_collection_expressions` for more information.
1198 where : `str`, optional
1199 A string expression similar to a SQL WHERE clause. May involve
1200 any column of a dimension table or (as a shortcut for the primary
1201 key column of a dimension table) dimension name. See
1202 :ref:`daf_butler_dimension_expressions` for more information.
1203 expand : `bool`, optional
1204 If `True` (default) yield `ExpandedDataCoordinate` instead of
1205 minimal `DataCoordinate` base-class instances.
1206 components : `bool`, optional
1207 If `True`, apply all dataset expression patterns to component
1208 dataset type names as well. If `False`, never apply patterns to
1209 components. If `None` (default), apply patterns to components only
1210 if their parent datasets were not matched by the expression.
1211 Fully-specified component datasets (`str` or `DatasetType`
1212 instances) are always included.
1213 **kwargs
1214 Additional keyword arguments are forwarded to
1215 `DataCoordinate.standardize` when processing the ``dataId``
1216 argument (and may be used to provide a constraining data ID even
1217 when the ``dataId`` argument is `None`).
1219 Yields
1220 ------
1221 dataId : `DataCoordinate`
1222 Data IDs matching the given query parameters. Order is
1223 unspecified.
1224 """
1225 dimensions = iterable(dimensions)
1226 standardizedDataId = self.expandDataId(dataId, **kwargs)
1227 standardizedDatasetTypes = set()
1228 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1229 if datasets is not None:
1230 if collections is None:
1231 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1232 for datasetType in self.queryDatasetTypes(datasets, components=components):
1233 requestedDimensionNames.update(datasetType.dimensions.names)
1234 # If any matched dataset type is a component, just operate on
1235 # its parent instead, because Registry doesn't know anything
1236 # about what components exist, and here (unlike queryDatasets)
1237 # we don't care about returning them.
1238 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1239 if componentName is not None:
1240 datasetType = self.getDatasetType(parentDatasetTypeName)
1241 standardizedDatasetTypes.add(datasetType)
1242 # Preprocess collections expression in case the original included
1243 # single-pass iterators (we'll want to use it multiple times
1244 # below).
1245 collections = CollectionQuery.fromExpression(collections)
1247 summary = QuerySummary(
1248 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1249 dataId=standardizedDataId,
1250 expression=where,
1251 )
1252 builder = self.makeQueryBuilder(summary)
1253 for datasetType in standardizedDatasetTypes:
1254 builder.joinDataset(datasetType, collections, isResult=False)
1255 query = builder.finish()
1256 predicate = query.predicate()
1257 for row in self._db.query(query.sql):
1258 if predicate(row):
1259 result = query.extractDataId(row)
1260 if expand:
1261 yield self.expandDataId(result, records=standardizedDataId.records)
1262 else:
1263 yield result
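    # Usage sketch for `queryDimensions`, constrained by the existence of "raw"
    # datasets as described in the docstring above (collection name, instrument,
    # and the ``where`` string are hypothetical):
    #
    #     dataIds = registry.queryDimensions(
    #         ["exposure", "detector"],
    #         datasets="raw",
    #         collections="raw/run1",
    #         where="instrument = 'HypoCam' AND exposure > 100",
    #     )
    #     for dataId in dataIds:
    #         print(dataId)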
1265 def queryDatasets(self, datasetType: Any, *,
1266 collections: Any,
1267 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1268 dataId: Optional[DataId] = None,
1269 where: Optional[str] = None,
1270 deduplicate: bool = False,
1271 expand: bool = True,
1272 components: Optional[bool] = None,
1273 **kwargs: Any) -> Iterator[DatasetRef]:
1274 """Query for and iterate over dataset references matching user-provided
1275 criteria.
1277 Parameters
1278 ----------
1279 datasetType
1280 An expression that fully or partially identifies the dataset types
1281 to be queried. Allowed types include `DatasetType`, `str`,
1282 `re.Pattern`, and iterables thereof. The special value `...` can
1283 be used to query all dataset types. See
1284 :ref:`daf_butler_dataset_type_expressions` for more information.
1285 collections
1286 An expression that fully or partially identifies the collections
1287 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1288 thereof. `...` can be used to return all collections. See
1289 :ref:`daf_butler_collection_expressions` for more information.
1290 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1291 Dimensions to include in the query (in addition to those used
1292 to identify the queried dataset type(s)), either to constrain
1293 the resulting datasets to those for which a matching dimension
1294 exists, or to relate the dataset type's dimensions to dimensions
1295 referenced by the ``dataId`` or ``where`` arguments.
1296 dataId : `dict` or `DataCoordinate`, optional
1297 A data ID whose key-value pairs are used as equality constraints
1298 in the query.
1299 where : `str`, optional
1300 A string expression similar to a SQL WHERE clause. May involve
1301 any column of a dimension table or (as a shortcut for the primary
1302 key column of a dimension table) dimension name. See
1303 :ref:`daf_butler_dimension_expressions` for more information.
1304 deduplicate : `bool`, optional
1305 If `True` (`False` is default), for each result data ID, only
1306 yield one `DatasetRef` of each `DatasetType`, from the first
1307 collection in which a dataset of that dataset type appears
1308 (according to the order of ``collections`` passed in). If `True`,
1309 ``collections`` must not contain regular expressions and may not
1310 be `...`.
1311 expand : `bool`, optional
1312 If `True` (default) attach `ExpandedDataCoordinate` instead of
1313 minimal `DataCoordinate` base-class instances.
1314 components : `bool`, optional
1315 If `True`, apply all dataset expression patterns to component
1316 dataset type names as well. If `False`, never apply patterns to
1317 components. If `None` (default), apply patterns to components only
1318 if their parent datasets were not matched by the expression.
1319 Fully-specified component datasets (`str` or `DatasetType`
1320 instances) are always included.
1321 **kwargs
1322 Additional keyword arguments are forwarded to
1323 `DataCoordinate.standardize` when processing the ``dataId``
1324 argument (and may be used to provide a constraining data ID even
1325 when the ``dataId`` argument is `None`).
1327 Yields
1328 ------
1329 ref : `DatasetRef`
1330 Dataset references matching the given query criteria. These
1331 are grouped by `DatasetType` if the query evaluates to multiple
1332 dataset types, but order is otherwise unspecified.
1334 Raises
1335 ------
1336 TypeError
1337 Raised when the arguments are incompatible, such as when a
1338 collection wildcard is passed when ``deduplicate`` is `True`.
1340 Notes
1341 -----
1342 When multiple dataset types are queried in a single call, the
1343 results of this operation are equivalent to querying for each dataset
1344 type separately in turn, and no information about the relationships
1345 between datasets of different types is included. In contexts where
1346 that kind of information is important, the recommended pattern is to
1347 use `queryDimensions` to first obtain data IDs (possibly with the
1348 desired dataset types and collections passed as constraints to the
1349 query), and then use multiple (generally much simpler) calls to
1350 `queryDatasets` with the returned data IDs passed as constraints.
1351 """
1352 # Standardize the collections expression.
1353 if deduplicate:
1354 collections = CollectionSearch.fromExpression(collections)
1355 else:
1356 collections = CollectionQuery.fromExpression(collections)
1357 # Standardize and expand the data ID provided as a constraint.
1358 standardizedDataId = self.expandDataId(dataId, **kwargs)
1360 # We can only query directly if given a non-component DatasetType
1361 # instance. If we were given an expression or str or a component
1362 # DatasetType instance, we'll populate this dict, recurse, and return.
1363 # If we already have a non-component DatasetType, it will remain None
1364 # and we'll run the query directly.
1365 composition: Optional[
1366 Dict[
1367 DatasetType, # parent dataset type
1368 List[Optional[str]] # component name, or None for parent
1369 ]
1370 ] = None
1371 if not isinstance(datasetType, DatasetType):
1372 # We were given a dataset type expression (which may be as simple
1373 # as a str). Loop over all matching datasets, delegating handling
1374 # of the `components` argument to queryDatasetTypes, as we populate
1375 # the composition dict.
1376 composition = defaultdict(list)
1377 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1378 parentName, componentName = trueDatasetType.nameAndComponent()
1379 if componentName is not None:
1380 parentDatasetType = self.getDatasetType(parentName)
1381 composition.setdefault(parentDatasetType, []).append(componentName)
1382 else:
1383 composition.setdefault(trueDatasetType, []).append(None)
1384 elif datasetType.isComponent():
1385 # We were given a true DatasetType instance, but it's a component.
1386 # The composition dict will have exactly one item.
1387 parentName, componentName = datasetType.nameAndComponent()
1388 parentDatasetType = self.getDatasetType(parentName)
1389 composition = {parentDatasetType: [componentName]}
1390 if composition is not None:
1391 # We need to recurse. Do that once for each parent dataset type.
1392 for parentDatasetType, componentNames in composition.items():
1393 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1394 dimensions=dimensions, dataId=standardizedDataId,
1395 where=where, deduplicate=deduplicate):
1396 # Loop over components, yielding one ref for each component
1397 # requested.
1398 for componentName in componentNames:
1399 if componentName is None:
1400 yield parentRef
1401 else:
1402 yield parentRef.makeComponentRef(componentName)
1403 return
1404 # If we get here, there's no need to recurse (or we are already
1405 # recursing; there can only ever be one level of recursion).
1407 # The full set of dimensions in the query is the combination of those
1408 # needed for the DatasetType and those explicitly requested, if any.
1409 requestedDimensionNames = set(datasetType.dimensions.names)
1410 if dimensions is not None:
1411 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1412 # Construct the summary structure needed to construct a QueryBuilder.
1413 summary = QuerySummary(
1414 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1415 dataId=standardizedDataId,
1416 expression=where,
1417 )
1418 builder = self.makeQueryBuilder(summary)
1419 # Add the dataset subquery to the query, telling the QueryBuilder to
1420 # include the rank of the selected collection in the results only if we
1421 # need to deduplicate. Note that if any of the collections are
1422 # actually wildcard expressions, and we've asked for deduplication,
1423 # this will raise TypeError for us.
1424 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1425 return
1426 query = builder.finish()
1427 predicate = query.predicate()
1428 if not deduplicate:
1429 # No need to de-duplicate across collections.
1430 for row in self._db.query(query.sql):
1431 if predicate(row):
1432 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1433 if expand:
1434 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1435 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1436 else:
1437 # For each data ID, yield only the DatasetRef with the lowest
1438 # collection rank.
1439 bestRefs = {}
1440 bestRanks: Dict[DataCoordinate, int] = {}
1441 for row in self._db.query(query.sql):
1442 if predicate(row):
1443 ref, rank = query.extractDatasetRef(row, datasetType)
1444 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1445 assert rank is not None
1446 if rank < bestRank:
1447 bestRefs[ref.dataId] = ref
1448 bestRanks[ref.dataId] = rank
1449 # If caller requested expanded data IDs, we defer that until here
1450 # so we do as little expansion as possible.
1451 if expand:
1452 for ref in bestRefs.values():
1453 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1454 yield ref.expanded(dataId)
1455 else:
1456 yield from bestRefs.values()
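    # Usage sketch for `queryDatasets` (dataset type name, collections, and
    # data ID values are hypothetical):
    #
    #     refs = registry.queryDatasets(
    #         "raw",
    #         collections=["raw/run1", "raw/run2"],
    #         dataId={"instrument": "HypoCam"},
    #         deduplicate=True,
    #     )
    #     for ref in refs:
    #         print(ref.datasetType.name, ref.dataId)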
1458 storageClasses: StorageClassFactory
1459 """All storage classes known to the registry (`StorageClassFactory`).
1460 """