Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Dict,
36 Iterable,
37 Iterator,
38 List,
39 Mapping,
40 Optional,
41 Set,
42 Type,
43 TYPE_CHECKING,
44 Union,
45)
47import astropy.time
48import sqlalchemy
50import lsst.sphgeom
51from ..core import (
52 Config,
53 DataCoordinate,
54 DataId,
55 DatasetRef,
56 DatasetType,
57 ddl,
58 Dimension,
59 DimensionElement,
60 DimensionGraph,
61 DimensionRecord,
62 DimensionUniverse,
63 ExpandedDataCoordinate,
64 NamedKeyDict,
65 Timespan,
66 StorageClassFactory,
67)
68from ..core.utils import doImport, iterable, transactional
69from ._config import RegistryConfig
70from .queries import (
71 QueryBuilder,
72 QuerySummary,
73)
74from ._collectionType import CollectionType
75from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
76from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
77from .interfaces import ChainedCollectionRecord, RunRecord
79if TYPE_CHECKING:
80 from ..butlerConfig import ButlerConfig
81 from .interfaces import (
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 DatastoreRegistryBridgeManager,
88 )
91@dataclass
92class ConsistentDataIds:
93 """A struct used to report relationships between data IDs by
94 `Registry.relateDataIds`.
96 If an instance of this class is returned (instead of `None`), the data IDs
97 are "not inconsistent" - any keys they have in common have the same value,
98 and any spatial or temporal relationships they have at least might involve
99 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
100 to `True` in boolean contexts.
101 """
103 overlaps: bool
104 """If `True`, the data IDs have at least one key in common, associated with
105 the same value.
107 Note that data IDs are not inconsistent even if overlaps is `False` - they
108 may simply have no keys in common, which means they cannot have
109 inconsistent values for any keys. They may even be equal, in the case that
110 both data IDs are empty.
112 This field does _not_ indicate whether a spatial or temporal overlap
113 relationship exists.
114 """
116 contains: bool
117 """If `True`, all keys in the first data ID are in the second, and are
118 associated with the same values.
120 This includes the case where the first data ID is empty.
121 """
123 within: bool
124 """If `True`, all keys in the second data ID are in the first, and are
125 associated with the same values.
127 This includes the case where the second data ID is empty.
128 """
130 @property
131 def equal(self) -> bool:
132 """If `True`, the two data IDs are the same.
134 Data IDs are equal if they have both a `contains` and a `within`
135 relationship.
136 """
137 return self.contains and self.within
139 @property
140 def disjoint(self) -> bool:
141 """If `True`, the two data IDs have no keys in common.
143 This is simply the opposite of `overlaps`. Disjoint data IDs are by
144 definition not inconsistent.
145 """
146 return not self.overlaps
148 def __bool__(self) -> bool:
149 return True
152class Registry:
153 """Registry interface.
155 Parameters
156 ----------
157 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
158 Registry configuration
159 """
161 defaultConfigFile = None
162 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
163 absolute path. Can be None if no defaults specified.
164 """
166 @classmethod
167 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
168 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
169 """Create `Registry` subclass instance from `config`.
171 Uses ``registry.cls`` from `config` to determine which subclass to
172 instantiate.
174 Parameters
175 ----------
176 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
177 Registry configuration
178 create : `bool`, optional
179 Assume empty Registry and create a new one.
180 butlerRoot : `str`, optional
181 Path to the repository root this `Registry` will manage.
182 writeable : `bool`, optional
183 If `True` (default) create a read-write connection to the database.
185 Returns
186 -------
187 registry : `Registry` (subclass)
188 A new `Registry` subclass instance.
189 """
190 if not isinstance(config, RegistryConfig):
191 if isinstance(config, str) or isinstance(config, Config):
192 config = RegistryConfig(config)
193 else:
194 raise ValueError("Incompatible Registry configuration: {}".format(config))
195 config.replaceRoot(butlerRoot)
196 DatabaseClass = config.getDatabaseClass()
197 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
198 namespace=config.get("namespace"), writeable=writeable)
199 universe = DimensionUniverse(config)
200 opaque = doImport(config["managers", "opaque"])
201 dimensions = doImport(config["managers", "dimensions"])
202 collections = doImport(config["managers", "collections"])
203 datasets = doImport(config["managers", "datasets"])
204 datastoreBridges = doImport(config["managers", "datastores"])
205 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
206 datasets=datasets, datastoreBridges=datastoreBridges, create=create)
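    # Illustrative usage sketch for `fromConfig`; the configuration path and
    # repository root below are hypothetical values, not taken from this file.
    #
    #     config = RegistryConfig("registry.yaml")
    #     registry = Registry.fromConfig(config, create=True, butlerRoot="/path/to/repo")
    #     assert registry.isWriteable()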
208 def __init__(self, database: Database, universe: DimensionUniverse, *,
209 opaque: Type[OpaqueTableStorageManager],
210 dimensions: Type[DimensionRecordStorageManager],
211 collections: Type[CollectionManager],
212 datasets: Type[DatasetRecordStorageManager],
213 datastoreBridges: Type[DatastoreRegistryBridgeManager],
214 create: bool = False):
215 self._db = database
216 self.storageClasses = StorageClassFactory()
217 with self._db.declareStaticTables(create=create) as context:
218 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
219 self._collections = collections.initialize(self._db, context)
220 self._datasets = datasets.initialize(self._db, context,
221 collections=self._collections,
222 universe=self.dimensions)
223 self._opaque = opaque.initialize(self._db, context)
224 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
225 opaque=self._opaque,
226 datasets=datasets,
227 universe=self.dimensions)
228 self._collections.refresh()
229 self._datasets.refresh(universe=self._dimensions.universe)
231 def __str__(self) -> str:
232 return str(self._db)
234 def __repr__(self) -> str:
235 return f"Registry({self._db!r}, {self.dimensions!r})"
237 def isWriteable(self) -> bool:
238 """Return `True` if this registry allows write operations, and `False`
239 otherwise.
240 """
241 return self._db.isWriteable()
243 @property
244 def dimensions(self) -> DimensionUniverse:
245 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
246 """
247 return self._dimensions.universe
249 @contextlib.contextmanager
250 def transaction(self) -> Iterator[None]:
251 """Return a context manager that represents a transaction.
252 """
253 # TODO make savepoint=False the default.
254 try:
255 with self._db.transaction():
256 yield
257 except BaseException:
258 # TODO: this clears the caches sometimes when we wouldn't actually
259 # need to. Can we avoid that?
260 self._dimensions.clearCaches()
261 raise
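    # Hedged sketch of the transaction context manager: writes inside the
    # block commit together, and any exception rolls them back (clearing the
    # dimension caches). The record contents below are hypothetical.
    #
    #     with registry.transaction():
    #         registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #         registry.insertDimensionData("physical_filter", {"instrument": "DummyCam",
    #                                                          "name": "d-r", "abstract_filter": "r"})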
263 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
264 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
265 other data repository client.
267 Opaque table records can be added via `insertOpaqueData`, retrieved via
268 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
270 Parameters
271 ----------
272 tableName : `str`
273 Logical name of the opaque table. This may differ from the
274 actual name used in the database by a prefix and/or suffix.
275 spec : `ddl.TableSpec`
276 Specification for the table to be added.
277 """
278 self._opaque.register(tableName, spec)
280 @transactional
281 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
282 """Insert records into an opaque table.
284 Parameters
285 ----------
286 tableName : `str`
287 Logical name of the opaque table. Must match the name used in a
288 previous call to `registerOpaqueTable`.
289 data
290 Each additional positional argument is a dictionary that represents
291 a single row to be added.
292 """
293 self._opaque[tableName].insert(*data)
295 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
296 """Retrieve records from an opaque table.
298 Parameters
299 ----------
300 tableName : `str`
301 Logical name of the opaque table. Must match the name used in a
302 previous call to `registerOpaqueTable`.
303 where
304 Additional keyword arguments are interpreted as equality
305 constraints that restrict the returned rows (combined with AND);
306 keyword arguments are column names and values are the values they
307 must have.
309 Yields
310 ------
311 row : `dict`
312 A dictionary representing a single result row.
313 """
314 yield from self._opaque[tableName].fetch(**where)
316 @transactional
317 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
318 """Remove records from an opaque table.
320 Parameters
321 ----------
322 tableName : `str`
323 Logical name of the opaque table. Must match the name used in a
324 previous call to `registerOpaqueTable`.
325 where
326 Additional keyword arguments are interpreted as equality
327 constraints that restrict the deleted rows (combined with AND);
328 keyword arguments are column names and values are the values they
329 must have.
330 """
331 self._opaque[tableName].delete(**where)
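    # Hypothetical round trip through the opaque-table API; the table name,
    # ``spec`` (a ddl.TableSpec built elsewhere), and row contents are
    # illustrative only.
    #
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)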
333 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
334 """Add a new collection if one with the given name does not exist.
336 Parameters
337 ----------
338 name : `str`
339 The name of the collection to create.
340 type : `CollectionType`
341 Enum value indicating the type of collection to create.
343 Notes
344 -----
345 This method cannot be called within transactions, as it needs to be
346 able to perform its own transaction to be concurrent.
347 """
348 self._collections.register(name, type)
350 def getCollectionType(self, name: str) -> CollectionType:
351 """Return an enumeration value indicating the type of the given
352 collection.
354 Parameters
355 ----------
356 name : `str`
357 The name of the collection.
359 Returns
360 -------
361 type : `CollectionType`
362 Enum value indicating the type of this collection.
364 Raises
365 ------
366 MissingCollectionError
367 Raised if no collection with the given name exists.
368 """
369 return self._collections.find(name).type
371 def registerRun(self, name: str) -> None:
372 """Add a new run if one with the given name does not exist.
374 Parameters
375 ----------
376 name : `str`
377 The name of the run to create.
379 Notes
380 -----
381 This method cannot be called within transactions, as it needs to be
382 able to perform its own transaction to be concurrent.
383 """
384 self._collections.register(name, CollectionType.RUN)
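    # Illustrative collection registration (names are hypothetical); both
    # calls are idempotent and must be made outside any transaction.
    #
    #     registry.registerCollection("analysis/tagged", CollectionType.TAGGED)
    #     registry.registerRun("HSC/runs/RC2")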
386 @transactional
387 def removeCollection(self, name: str) -> None:
388 """Completely remove the given collection.
390 Parameters
391 ----------
392 name : `str`
393 The name of the collection to remove.
395 Raises
396 ------
397 MissingCollectionError
398 Raised if no collection with the given name exists.
400 Notes
401 -----
402 If this is a `~CollectionType.RUN` collection, all datasets and quanta
403 in it are also fully removed. This requires that those datasets be
404 removed (or at least trashed) from any datastores that hold them first.
406 A collection may not be deleted as long as it is referenced by a
407 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
408 be deleted or redefined first.
409 """
410 self._collections.remove(name)
412 def getCollectionChain(self, parent: str) -> CollectionSearch:
413 """Return the child collections in a `~CollectionType.CHAINED`
414 collection.
416 Parameters
417 ----------
418 parent : `str`
419 Name of the chained collection. Must have already been added via
420 a call to `Registry.registerCollection`.
422 Returns
423 -------
424 children : `CollectionSearch`
425 An object that defines the search path of the collection.
426 See :ref:`daf_butler_collection_expressions` for more information.
428 Raises
429 ------
430 MissingCollectionError
431 Raised if ``parent`` does not exist in the `Registry`.
432 TypeError
433 Raised if ``parent`` does not correspond to a
434 `~CollectionType.CHAINED` collection.
435 """
436 record = self._collections.find(parent)
437 if record.type is not CollectionType.CHAINED:
438 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
439 assert isinstance(record, ChainedCollectionRecord)
440 return record.children
442 @transactional
443 def setCollectionChain(self, parent: str, children: Any) -> None:
444 """Define or redefine a `~CollectionType.CHAINED` collection.
446 Parameters
447 ----------
448 parent : `str`
449 Name of the chained collection. Must have already been added via
450 a call to `Registry.registerCollection`.
451 children : `Any`
452 An expression defining an ordered search of child collections,
453 generally an iterable of `str`. Restrictions on the dataset types
454 to be searched can also be included, by passing a mapping or an
455 iterable containing tuples; see
456 :ref:`daf_butler_collection_expressions` for more information.
458 Raises
459 ------
460 MissingCollectionError
461 Raised when any of the given collections do not exist in the
462 `Registry`.
463 TypeError
464 Raised if ``parent`` does not correspond to a
465 `~CollectionType.CHAINED` collection.
466 ValueError
467 Raised if the given collections contain a cycle.
468 """
469 record = self._collections.find(parent)
470 if record.type is not CollectionType.CHAINED:
471 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
472 assert isinstance(record, ChainedCollectionRecord)
473 children = CollectionSearch.fromExpression(children)
474 if children != record.children:
475 record.update(self._collections, children)
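    # Sketch of defining a chained collection from existing children; the
    # collection names are hypothetical and the parent must already be
    # registered with type CHAINED.
    #
    #     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("HSC/defaults", ["HSC/runs/RC2", "HSC/calib"])
    #     children = registry.getCollectionChain("HSC/defaults")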
477 def registerDatasetType(self, datasetType: DatasetType) -> bool:
478 """
479 Add a new `DatasetType` to the Registry.
481 It is not an error to register the same `DatasetType` twice.
483 Parameters
484 ----------
485 datasetType : `DatasetType`
486 The `DatasetType` to be added.
488 Returns
489 -------
490 inserted : `bool`
491 `True` if ``datasetType`` was inserted, `False` if an identical
492 existing `DatasetType` was found. Note that in either case the
493 DatasetType is guaranteed to be defined in the Registry
494 consistently with the given definition.
496 Raises
497 ------
498 ValueError
499 Raised if the dimensions or storage class are invalid.
500 ConflictingDefinitionError
501 Raised if this DatasetType is already registered with a different
502 definition.
504 Notes
505 -----
506 This method cannot be called within transactions, as it needs to be
507 able to perform its own transaction to be concurrent.
508 """
509 _, inserted = self._datasets.register(datasetType)
510 return inserted
512 def getDatasetType(self, name: str) -> DatasetType:
513 """Get the `DatasetType`.
515 Parameters
516 ----------
517 name : `str`
518 Name of the type.
520 Returns
521 -------
522 type : `DatasetType`
523 The `DatasetType` associated with the given name.
525 Raises
526 ------
527 KeyError
528 Requested named DatasetType could not be found in registry.
529 """
530 storage = self._datasets.find(name)
531 if storage is None:
532 raise KeyError(f"DatasetType '{name}' could not be found.")
533 return storage.datasetType
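    # Hedged example of registering and retrieving a dataset type; the
    # DatasetType constructor arguments, dimension names, and storage class
    # are assumptions for illustration.
    #
    #     datasetType = DatasetType("calexp", dimensions=("instrument", "visit", "detector"),
    #                               storageClass="ExposureF", universe=registry.dimensions)
    #     registry.registerDatasetType(datasetType)
    #     assert registry.getDatasetType("calexp") == datasetType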
535 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
536 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
537 """Find a dataset given its `DatasetType` and data ID.
539 This can be used to obtain a `DatasetRef` that permits the dataset to
540 be read from a `Datastore`. If the dataset is a component and can not
541 be found using the provided dataset type, a dataset ref for the parent
542 will be returned instead but with the correct dataset type.
544 Parameters
545 ----------
546 datasetType : `DatasetType` or `str`
547 A `DatasetType` or the name of one.
548 dataId : `dict` or `DataCoordinate`, optional
549 A `dict`-like object containing the `Dimension` links that identify
550 the dataset within a collection.
551 collections
552 An expression that fully or partially identifies the collections
553 to search for the dataset, such as a `str`, `re.Pattern`, or
554 iterable thereof. `...` can be used to return all collections.
555 See :ref:`daf_butler_collection_expressions` for more information.
556 **kwargs
557 Additional keyword arguments passed to
558 `DataCoordinate.standardize` to convert ``dataId`` to a true
559 `DataCoordinate` or augment an existing one.
561 Returns
562 -------
563 ref : `DatasetRef`
564 A reference to the dataset, or `None` if no matching Dataset
565 was found.
567 Raises
568 ------
569 LookupError
570 Raised if one or more data ID keys are missing or the dataset type
571 does not exist.
572 MissingCollectionError
573 Raised if any of ``collections`` does not exist in the registry.
574 """
575 if isinstance(datasetType, DatasetType):
576 storage = self._datasets.find(datasetType.name)
577 if storage is None:
578 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
579 else:
580 storage = self._datasets.find(datasetType)
581 if storage is None:
582 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
583 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
584 universe=self.dimensions, **kwargs)
585 collections = CollectionSearch.fromExpression(collections)
586 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
587 result = storage.find(collectionRecord, dataId)
588 if result is not None:
589 return result
591 # fallback to the parent if we got nothing and this was a component
592 if storage.datasetType.isComponent():
593 parentType, _ = storage.datasetType.nameAndComponent()
594 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
595 if parentRef is not None:
596 # Data ID should already conform, and we know there are no components
597 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
598 run=parentRef.run, conform=False, hasParentId=True)
600 return None
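    # Illustrative lookup of a single dataset; the dataset type name, data ID
    # values, and collection are hypothetical.
    #
    #     ref = registry.findDataset("calexp", collections="HSC/runs/RC2",
    #                                instrument="HSC", visit=903334, detector=20)
    #     if ref is not None:
    #         print(ref.id, ref.run)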
602 @transactional
603 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
604 run: str) -> List[DatasetRef]:
605 """Insert one or more datasets into the `Registry`
607 This always adds new datasets; to associate existing datasets with
608 a new collection, use ``associate``.
610 Parameters
611 ----------
612 datasetType : `DatasetType` or `str`
613 A `DatasetType` or the name of one.
614 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
615 Dimension-based identifiers for the new datasets.
616 run : `str`
617 The name of the run that produced the datasets.
619 Returns
620 -------
621 refs : `list` of `DatasetRef`
622 Resolved `DatasetRef` instances for all given data IDs (in the same
623 order).
625 Raises
626 ------
627 ConflictingDefinitionError
628 If a dataset with the same dataset type and data ID as one of those
629 given already exists in ``run``.
630 MissingCollectionError
631 Raised if ``run`` does not exist in the registry.
632 """
633 if isinstance(datasetType, DatasetType):
634 storage = self._datasets.find(datasetType.name)
635 if storage is None:
636 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
637 else:
638 storage = self._datasets.find(datasetType)
639 if storage is None:
640 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
641 runRecord = self._collections.find(run)
642 if runRecord.type is not CollectionType.RUN:
643 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
644 assert isinstance(runRecord, RunRecord)
645 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
646 for dataId in dataIds]
647 try:
648 refs = list(storage.insert(runRecord, expandedDataIds))
649 except sqlalchemy.exc.IntegrityError as err:
650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
651 f"one or more datasets of type {storage.datasetType} into "
652 f"collection '{run}'. "
653 f"This probably means a dataset with the same data ID "
654 f"and dataset type already exists, but it may also mean a "
655 f"dimension row is missing.") from err
656 return refs
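    # Sketch of inserting new datasets into a RUN collection; the names and
    # data ID below are hypothetical, and the run plus all dimension rows
    # must already exist.
    #
    #     refs = registry.insertDatasets("calexp",
    #                                    dataIds=[{"instrument": "HSC", "visit": 903334,
    #                                              "detector": 20}],
    #                                    run="HSC/runs/RC2")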
658 def getDataset(self, id: int) -> Optional[DatasetRef]:
659 """Retrieve a Dataset entry.
661 Parameters
662 ----------
663 id : `int`
664 The unique identifier for the dataset.
666 Returns
667 -------
668 ref : `DatasetRef` or `None`
669 A ref to the Dataset, or `None` if no matching Dataset
670 was found.
671 """
672 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
673 if ref is None:
674 return None
675 return ref
677 @transactional
678 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
679 """Remove datasets from the Registry.
681 The datasets will be removed unconditionally from all collections, and
682 any `Quantum` that consumed this dataset will instead be marked with
683 having a NULL input. `Datastore` records will *not* be deleted; the
684 caller is responsible for ensuring that the dataset has already been
685 removed from all Datastores.
687 Parameters
688 ----------
689 refs : `Iterable` of `DatasetRef`
690 References to the datasets to be removed. Must include a valid
691 ``id`` attribute, and should be considered invalidated upon return.
693 Raises
694 ------
695 AmbiguousDatasetError
696 Raised if any ``ref.id`` is `None`.
697 OrphanedRecordError
698 Raised if any dataset is still present in any `Datastore`.
699 """
700 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
701 storage = self._datasets.find(datasetType.name)
702 assert storage is not None
703 try:
704 storage.delete(refsForType)
705 except sqlalchemy.exc.IntegrityError as err:
706 raise OrphanedRecordError("One or more datasets is still "
707 "present in one or more Datastores.") from err
709 @transactional
710 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
711 """Add existing datasets to a `~CollectionType.TAGGED` collection.
713 If a DatasetRef with the same exact integer ID is already in a
714 collection nothing is changed. If a `DatasetRef` with the same
715 `DatasetType` and data ID but with different integer ID
716 exists in the collection, `ConflictingDefinitionError` is raised.
718 Parameters
719 ----------
720 collection : `str`
721 Indicates the collection the datasets should be associated with.
722 refs : `Iterable` [ `DatasetRef` ]
723 An iterable of resolved `DatasetRef` instances that already exist
724 in this `Registry`.
726 Raises
727 ------
728 ConflictingDefinitionError
729 If a Dataset with the given `DatasetRef` already exists in the
730 given collection.
731 AmbiguousDatasetError
732 Raised if ``any(ref.id is None for ref in refs)``.
733 MissingCollectionError
734 Raised if ``collection`` does not exist in the registry.
735 TypeError
736 Raised if adding new datasets to the given ``collection`` is not
737 allowed.
738 """
739 collectionRecord = self._collections.find(collection)
740 if collectionRecord.type is not CollectionType.TAGGED:
741 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
742 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
743 storage = self._datasets.find(datasetType.name)
744 assert storage is not None
745 try:
746 storage.associate(collectionRecord, refsForType)
747 except sqlalchemy.exc.IntegrityError as err:
748 raise ConflictingDefinitionError(
749 f"Constraint violation while associating dataset of type {datasetType.name} with "
750 f"collection {collection}. This probably means that one or more datasets with the same "
751 f"dataset type and data ID already exist in the collection, but it may also indicate "
752 f"that the datasets do not exist."
753 ) from err
755 @transactional
756 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
757 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
759 ``collection`` and ``ref`` combinations that are not currently
760 associated are silently ignored.
762 Parameters
763 ----------
764 collection : `str`
765 The collection the datasets should no longer be associated with.
766 refs : `Iterable` [ `DatasetRef` ]
767 An iterable of resolved `DatasetRef` instances that already exist
768 in this `Registry`.
770 Raises
771 ------
772 AmbiguousDatasetError
773 Raised if any of the given dataset references is unresolved.
774 MissingCollectionError
775 Raised if ``collection`` does not exist in the registry.
776 TypeError
777 Raised if removing datasets from the given ``collection`` is not
778 allowed.
779 """
780 collectionRecord = self._collections.find(collection)
781 if collectionRecord.type is not CollectionType.TAGGED:
782 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
783 "expected TAGGED.")
784 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
785 storage = self._datasets.find(datasetType.name)
786 assert storage is not None
787 storage.disassociate(collectionRecord, refsForType)
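    # Hedged sketch of tagging and untagging resolved datasets; the TAGGED
    # collection name and ``refs`` (resolved DatasetRef instances) are
    # hypothetical.
    #
    #     registry.registerCollection("analysis/good", CollectionType.TAGGED)
    #     registry.associate("analysis/good", refs)
    #     registry.disassociate("analysis/good", refs[:1])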
789 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
790 """Return an object that allows a new `Datastore` instance to
791 communicate with this `Registry`.
793 Returns
794 -------
795 manager : `DatastoreRegistryBridgeManager`
796 Object that mediates communication between this `Registry` and its
797 associated datastores.
798 """
799 return self._datastoreBridges
801 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
802 """Retrieve datastore locations for a given dataset.
804 Parameters
805 ----------
806 ref : `DatasetRef`
807 A reference to the dataset for which to retrieve storage
808 information.
810 Returns
811 -------
812 datastores : `Iterable` [ `str` ]
813 All the matching datastores holding this dataset.
815 Raises
816 ------
817 AmbiguousDatasetError
818 Raised if ``ref.id`` is `None`.
819 """
820 return self._datastoreBridges.findDatastores(ref)
822 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
823 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None,
824 **kwargs: Any) -> ExpandedDataCoordinate:
825 """Expand a dimension-based data ID to include additional information.
827 Parameters
828 ----------
829 dataId : `DataCoordinate` or `dict`, optional
830 Data ID to be expanded; augmented and overridden by ``kwargs``.
831 graph : `DimensionGraph`, optional
832 Set of dimensions for the expanded ID. If `None`, the dimensions
833 will be inferred from the keys of ``dataId`` and ``kwargs``.
834 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
835 are silently ignored, providing a way to extract and expand a
836 subset of a data ID.
837 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional
838 Dimension record data to use before querying the database for that
839 data.
840 **kwargs
841 Additional keywords are treated like additional key-value pairs for
842 ``dataId``, extending and overriding it.
844 Returns
845 -------
846 expanded : `ExpandedDataCoordinate`
847 A data ID that includes full metadata for all of the dimensions it
848 identifies.
849 """
850 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
851 if isinstance(standardized, ExpandedDataCoordinate):
852 return standardized
853 elif isinstance(dataId, ExpandedDataCoordinate):
854 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
855 records.update(dataId.records)
856 else:
857 records = NamedKeyDict(records) if records is not None else NamedKeyDict()
858 keys = dict(standardized.byName())
859 regions: List[lsst.sphgeom.ConvexPolygon] = []
860 timespans: List[Timespan[astropy.time.Time]] = []
861 for element in standardized.graph.primaryKeyTraversalOrder:
862 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
863 if record is ...:
864 storage = self._dimensions[element]
865 record = storage.fetch(keys)
866 records[element] = record
867 if record is not None:
868 for d in element.implied:
869 value = getattr(record, d.name)
870 if keys.setdefault(d.name, value) != value:
871 raise InconsistentDataIdError(
872 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
873 f"but {element.name} implies {d.name}={value!r}."
874 )
875 if element in standardized.graph.spatial and record.region is not None:
876 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
877 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
878 f"is disjoint with those for other elements.")
879 regions.append(record.region)
880 if element in standardized.graph.temporal:
881 if any(not record.timespan.overlaps(t) for t in timespans):
882 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
883 f" is disjoint with those for other elements.")
884 timespans.append(record.timespan)
885 else:
886 if element in standardized.graph.required:
887 raise LookupError(
888 f"Could not fetch record for required dimension {element.name} via keys {keys}."
889 )
890 if element.alwaysJoin:
891 raise InconsistentDataIdError(
892 f"Could not fetch record for element {element.name} via keys {keys}, ",
893 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
894 "related."
895 )
896 records.update((d, None) for d in element.implied)
897 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
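    # Illustrative expansion of a minimal data ID into an
    # ExpandedDataCoordinate carrying dimension records; the values and the
    # string lookup into ``records`` are assumptions.
    #
    #     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
    #     print(dataId.records["exposure"].timespan)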
899 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
900 """Compare the keys and values of a pair of data IDs for consistency.
902 See `ConsistentDataIds` for more information.
904 Parameters
905 ----------
906 a : `dict` or `DataCoordinate`
907 First data ID to be compared.
908 b : `dict` or `DataCoordinate`
909 Second data ID to be compared.
911 Returns
912 -------
913 relationship : `ConsistentDataIds` or `None`
914 Relationship information. This is not `None` and coerces to
915 `True` in boolean contexts if and only if the data IDs are
916 consistent in terms of all common key-value pairs, all many-to-many
917 join tables, and all spatial and temporal relationships.
918 """
919 a = DataCoordinate.standardize(a, universe=self.dimensions)
920 b = DataCoordinate.standardize(b, universe=self.dimensions)
921 aFull = getattr(a, "full", None)
922 bFull = getattr(b, "full", None)
923 aBest = aFull if aFull is not None else a
924 bBest = bFull if bFull is not None else b
925 jointKeys = aBest.keys() & bBest.keys()
926 # If any common values are not equal, we know they are inconsistent.
927 if any(aBest[k] != bBest[k] for k in jointKeys):
928 return None
929 # If the graphs are equal, we know the data IDs are.
930 if a.graph == b.graph:
931 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
932 # Result is still inconclusive. Try to expand a data ID containing
933 # keys from both; that will fail if they are inconsistent.
934 # First, if either input was already an ExpandedDataCoordinate, extract
935 # its records so we don't have to query for them.
936 records: NamedKeyDict[DimensionElement, Optional[DimensionRecord]] = NamedKeyDict()
937 if isinstance(a, ExpandedDataCoordinate):
938 records.update(a.records)
939 if isinstance(b, ExpandedDataCoordinate):
940 records.update(b.records)
941 try:
942 self.expandDataId({**a.byName(), **b.byName()}, graph=(a.graph | b.graph), records=records)
943 except InconsistentDataIdError:
944 return None
945 # We know the answer is not `None`; time to figure out what it is.
946 return ConsistentDataIds(
947 contains=(a.graph >= b.graph),
948 within=(a.graph <= b.graph),
949 overlaps=bool(a.graph & b.graph),
950 )
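    # Sketch of comparing two data IDs; the return value (ConsistentDataIds
    # or None) coerces to bool, so it can be tested directly. Key-value pairs
    # are hypothetical.
    #
    #     rel = registry.relateDataIds({"instrument": "HSC", "visit": 903334},
    #                                  {"instrument": "HSC"})
    #     if rel and rel.contains:
    #         pass  # the first data ID contains the second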
952 def insertDimensionData(self, element: Union[DimensionElement, str],
953 *data: Union[Mapping[str, Any], DimensionRecord],
954 conform: bool = True) -> None:
955 """Insert one or more dimension records into the database.
957 Parameters
958 ----------
959 element : `DimensionElement` or `str`
960 The `DimensionElement` or name thereof that identifies the table
961 records will be inserted into.
962 data : `dict` or `DimensionRecord` (variadic)
963 One or more records to insert.
964 conform : `bool`, optional
965 If `False` (`True` is default) perform no checking or conversions,
966 and assume that ``element`` is a `DimensionElement` instance and
967 ``data`` is one or more `DimensionRecord` instances of the
968 appropriate subclass.
969 """
970 if conform:
971 if isinstance(element, str):
972 element = self.dimensions[element]
973 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
974 for row in data]
975 else:
976 # Ignore typing since caller said to trust them with conform=False.
977 records = data # type: ignore
978 storage = self._dimensions[element] # type: ignore
979 storage.insert(*records)
981 def syncDimensionData(self, element: Union[DimensionElement, str],
982 row: Union[Mapping[str, Any], DimensionRecord],
983 conform: bool = True) -> bool:
984 """Synchronize the given dimension record with the database, inserting
985 if it does not already exist and comparing values if it does.
987 Parameters
988 ----------
989 element : `DimensionElement` or `str`
990 The `DimensionElement` or name thereof that identifies the table
991 records will be inserted into.
992 row : `dict` or `DimensionRecord`
993 The record to insert.
994 conform : `bool`, optional
995 If `False` (`True` is default) perform no checking or conversions,
996 and assume that ``element`` is a `DimensionElement` instance and
997 ``row`` is a `DimensionRecord` instance of the
998 appropriate subclass.
1000 Returns
1001 -------
1002 inserted : `bool`
1003 `True` if a new row was inserted, `False` otherwise.
1005 Raises
1006 ------
1007 ConflictingDefinitionError
1008 Raised if the record exists in the database (according to primary
1009 key lookup) but is inconsistent with the given one.
1011 Notes
1012 -----
1013 This method cannot be called within transactions, as it needs to be
1014 able to perform its own transaction to be concurrent.
1015 """
1016 if conform:
1017 if isinstance(element, str):
1018 element = self.dimensions[element]
1019 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
1020 else:
1021 # Ignore typing since caller said to trust them with conform=False.
1022 record = row # type: ignore
1023 storage = self._dimensions[element] # type: ignore
1024 return storage.sync(record)
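    # Hedged example of inserting and synchronizing dimension records; the
    # record fields shown are illustrative, not a complete schema.
    #
    #     registry.insertDimensionData("detector", {"instrument": "DummyCam", "id": 1,
    #                                               "full_name": "RXX_S00"})
    #     inserted = registry.syncDimensionData("instrument", {"name": "DummyCam"})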
1026 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1027 ) -> Iterator[DatasetType]:
1028 """Iterate over the dataset types whose names match an expression.
1030 Parameters
1031 ----------
1032 expression : `Any`, optional
1033 An expression that fully or partially identifies the dataset types
1034 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1035 `...` can be used to return all dataset types, and is the default.
1036 See :ref:`daf_butler_dataset_type_expressions` for more
1037 information.
1038 components : `bool`, optional
1039 If `True`, apply all expression patterns to component dataset type
1040 names as well. If `False`, never apply patterns to components.
1041 If `None` (default), apply patterns to components only if their
1042 parent datasets were not matched by the expression.
1043 Fully-specified component datasets (`str` or `DatasetType`
1044 instances) are always included.
1046 Yields
1047 ------
1048 datasetType : `DatasetType`
1049 A `DatasetType` instance whose name matches ``expression``.
1050 """
1051 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1052 if wildcard is Ellipsis:
1053 for datasetType in self._datasets:
1054 if components or not datasetType.isComponent():
1055 yield datasetType
1056 return
1057 done: Set[str] = set()
1058 for name in wildcard.strings:
1059 storage = self._datasets.find(name)
1060 if storage is not None:
1061 done.add(storage.datasetType.name)
1062 yield storage.datasetType
1063 if wildcard.patterns:
1064 # If components (the argument) is None, we'll save component
1065 # dataset types that we might want to match, but only if their parents
1066 # didn't get included.
1067 componentsForLater = []
1068 for datasetType in self._datasets:
1069 if datasetType.name in done:
1070 continue
1071 parentName, componentName = datasetType.nameAndComponent()
1072 if componentName is not None and not components:
1073 if components is None and parentName not in done:
1074 componentsForLater.append(datasetType)
1075 continue
1076 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1077 done.add(datasetType.name)
1078 yield datasetType
1079 # Go back and try to match saved components.
1080 for datasetType in componentsForLater:
1081 parentName, _ = datasetType.nameAndComponent()
1082 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1083 yield datasetType
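    # Illustrative dataset-type queries; the regular expression is a
    # hypothetical pattern.
    #
    #     import re
    #     all_types = list(registry.queryDatasetTypes(...))
    #     coadds = list(registry.queryDatasetTypes(re.compile("deepCoadd.*"), components=False))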
1085 def queryCollections(self, expression: Any = ...,
1086 datasetType: Optional[DatasetType] = None,
1087 collectionType: Optional[CollectionType] = None,
1088 flattenChains: bool = False,
1089 includeChains: Optional[bool] = None) -> Iterator[str]:
1090 """Iterate over the collections whose names match an expression.
1092 Parameters
1093 ----------
1094 expression : `Any`, optional
1095 An expression that fully or partially identifies the collections
1096 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1097 `...` can be used to return all collections, and is the default.
1098 See :ref:`daf_butler_collection_expressions` for more
1099 information.
1100 datasetType : `DatasetType`, optional
1101 If provided, only yield collections that should be searched for
1102 this dataset type according to ``expression``. If this is
1103 not provided, any dataset type restrictions in ``expression`` are
1104 ignored.
1105 collectionType : `CollectionType`, optional
1106 If provided, only yield collections of this type.
1107 flattenChains : `bool`, optional
1108 If `True` (`False` is default), recursively yield the child
1109 collections of matching `~CollectionType.CHAINED` collections.
1110 includeChains : `bool`, optional
1111 If `True`, yield records for matching `~CollectionType.CHAINED`
1112 collections. Default is the opposite of ``flattenChains``: include
1113 either CHAINED collections or their children, but not both.
1115 Yields
1116 ------
1117 collection : `str`
1118 The name of a collection that matches ``expression``.
1119 """
1120 query = CollectionQuery.fromExpression(expression)
1121 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1122 flattenChains=flattenChains, includeChains=includeChains):
1123 yield record.name
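    # Sketch of listing collections by pattern and type; the pattern is
    # hypothetical.
    #
    #     import re
    #     runs = list(registry.queryCollections(re.compile("HSC/runs/.*"),
    #                                           collectionType=CollectionType.RUN))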
1125 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1126 """Return a `QueryBuilder` instance capable of constructing and
1127 managing more complex queries than those obtainable via `Registry`
1128 interfaces.
1130 This is an advanced interface; downstream code should prefer
1131 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1132 are sufficient.
1134 Parameters
1135 ----------
1136 summary : `QuerySummary`
1137 Object describing and categorizing the full set of dimensions that
1138 will be included in the query.
1140 Returns
1141 -------
1142 builder : `QueryBuilder`
1143 Object that can be used to construct and perform advanced queries.
1144 """
1145 return QueryBuilder(summary=summary,
1146 collections=self._collections,
1147 dimensions=self._dimensions,
1148 datasets=self._datasets)
1150 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1151 dataId: Optional[DataId] = None,
1152 datasets: Any = None,
1153 collections: Any = None,
1154 where: Optional[str] = None,
1155 expand: bool = True,
1156 components: Optional[bool] = None,
1157 **kwargs: Any) -> Iterator[DataCoordinate]:
1158 """Query for and iterate over data IDs matching user-provided criteria.
1160 Parameters
1161 ----------
1162 dimensions : `Dimension` or `str`, or iterable thereof
1163 The dimensions of the data IDs to yield, as either `Dimension`
1164 instances or `str`. Will be automatically expanded to a complete
1165 `DimensionGraph`.
1166 dataId : `dict` or `DataCoordinate`, optional
1167 A data ID whose key-value pairs are used as equality constraints
1168 in the query.
1169 datasets : `Any`, optional
1170 An expression that fully or partially identifies dataset types
1171 that should constrain the yielded data IDs. For example, including
1172 "raw" here would constrain the yielded ``instrument``,
1173 ``exposure``, ``detector``, and ``physical_filter`` values to only
1174 those for which at least one "raw" dataset exists in
1175 ``collections``. Allowed types include `DatasetType`, `str`,
1176 `re.Pattern`, and iterables thereof. Unlike other dataset type
1177 expressions, `...` is not permitted - it doesn't make sense to
1178 constrain data IDs on the existence of *all* datasets.
1179 See :ref:`daf_butler_dataset_type_expressions` for more
1180 information.
1181 collections : `Any`, optional
1182 An expression that fully or partially identifies the collections
1183 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1184 thereof. `...` can be used to return all collections. Must be
1185 provided if ``datasets`` is, and is ignored if it is not. See
1186 :ref:`daf_butler_collection_expressions` for more information.
1187 where : `str`, optional
1188 A string expression similar to a SQL WHERE clause. May involve
1189 any column of a dimension table or (as a shortcut for the primary
1190 key column of a dimension table) dimension name. See
1191 :ref:`daf_butler_dimension_expressions` for more information.
1192 expand : `bool`, optional
1193 If `True` (default) yield `ExpandedDataCoordinate` instead of
1194 minimal `DataCoordinate` base-class instances.
1195 components : `bool`, optional
1196 If `True`, apply all dataset expression patterns to component
1197 dataset type names as well. If `False`, never apply patterns to
1198 components. If `None` (default), apply patterns to components only
1199 if their parent datasets were not matched by the expression.
1200 Fully-specified component datasets (`str` or `DatasetType`
1201 instances) are always included.
1202 **kwargs
1203 Additional keyword arguments are forwarded to
1204 `DataCoordinate.standardize` when processing the ``dataId``
1205 argument (and may be used to provide a constraining data ID even
1206 when the ``dataId`` argument is `None`).
1208 Yields
1209 ------
1210 dataId : `DataCoordinate`
1211 Data IDs matching the given query parameters. Order is
1212 unspecified.
1213 """
1214 dimensions = iterable(dimensions)
1215 standardizedDataId = self.expandDataId(dataId, **kwargs)
1216 standardizedDatasetTypes = set()
1217 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1218 if datasets is not None:
1219 if collections is None:
1220 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1221 for datasetType in self.queryDatasetTypes(datasets, components=components):
1222 requestedDimensionNames.update(datasetType.dimensions.names)
1223 # If any matched dataset type is a component, just operate on
1224 # its parent instead, because Registry doesn't know anything
1225 # about what components exist, and here (unlike queryDatasets)
1226 # we don't care about returning them.
1227 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1228 if componentName is not None:
1229 datasetType = self.getDatasetType(parentDatasetTypeName)
1230 standardizedDatasetTypes.add(datasetType)
1231 # Preprocess collections expression in case the original included
1232 # single-pass iterators (we'll want to use it multiple times
1233 # below).
1234 collections = CollectionQuery.fromExpression(collections)
1236 summary = QuerySummary(
1237 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1238 dataId=standardizedDataId,
1239 expression=where,
1240 )
1241 builder = self.makeQueryBuilder(summary)
1242 for datasetType in standardizedDatasetTypes:
1243 builder.joinDataset(datasetType, collections, isResult=False)
1244 query = builder.finish()
1245 predicate = query.predicate()
1246 for row in self._db.query(query.sql):
1247 if predicate(row):
1248 result = query.extractDataId(row)
1249 if expand:
1250 yield self.expandDataId(result, records=standardizedDataId.records)
1251 else:
1252 yield result
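    # Illustrative data-ID query constrained by dataset existence and a WHERE
    # expression; the dataset type, collection, and expression syntax are
    # assumptions for illustration.
    #
    #     dataIds = registry.queryDimensions(["exposure", "detector"],
    #                                        datasets="raw", collections="HSC/raw/all",
    #                                        where="instrument='HSC' AND detector=20",
    #                                        expand=False)
    #     for dataId in dataIds:
    #         print(dataId)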
1254 def queryDatasets(self, datasetType: Any, *,
1255 collections: Any,
1256 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1257 dataId: Optional[DataId] = None,
1258 where: Optional[str] = None,
1259 deduplicate: bool = False,
1260 expand: bool = True,
1261 components: Optional[bool] = None,
1262 **kwargs: Any) -> Iterator[DatasetRef]:
1263 """Query for and iterate over dataset references matching user-provided
1264 criteria.
1266 Parameters
1267 ----------
1268 datasetType
1269 An expression that fully or partially identifies the dataset types
1270 to be queried. Allowed types include `DatasetType`, `str`,
1271 `re.Pattern`, and iterables thereof. The special value `...` can
1272 be used to query all dataset types. See
1273 :ref:`daf_butler_dataset_type_expressions` for more information.
1274 collections
1275 An expression that fully or partially identifies the collections
1276 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1277 thereof. `...` can be used to return all collections. See
1278 :ref:`daf_butler_collection_expressions` for more information.
1279 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1280 Dimensions to include in the query (in addition to those used
1281 to identify the queried dataset type(s)), either to constrain
1282 the resulting datasets to those for which a matching dimension
1283 exists, or to relate the dataset type's dimensions to dimensions
1284 referenced by the ``dataId`` or ``where`` arguments.
1285 dataId : `dict` or `DataCoordinate`, optional
1286 A data ID whose key-value pairs are used as equality constraints
1287 in the query.
1288 where : `str`, optional
1289 A string expression similar to a SQL WHERE clause. May involve
1290 any column of a dimension table or (as a shortcut for the primary
1291 key column of a dimension table) dimension name. See
1292 :ref:`daf_butler_dimension_expressions` for more information.
1293 deduplicate : `bool`, optional
1294 If `True` (`False` is default), for each result data ID, only
1295 yield one `DatasetRef` of each `DatasetType`, from the first
1296 collection in which a dataset of that dataset type appears
1297 (according to the order of ``collections`` passed in). If `True`,
1298 ``collections`` must not contain regular expressions and may not
1299 be `...`.
1300 expand : `bool`, optional
1301 If `True` (default) attach `ExpandedDataCoordinate` instead of
1302 minimal `DataCoordinate` base-class instances.
1303 components : `bool`, optional
1304 If `True`, apply all dataset expression patterns to component
1305 dataset type names as well. If `False`, never apply patterns to
1306 components. If `None` (default), apply patterns to components only
1307 if their parent datasets were not matched by the expression.
1308 Fully-specified component datasets (`str` or `DatasetType`
1309 instances) are always included.
1310 **kwargs
1311 Additional keyword arguments are forwarded to
1312 `DataCoordinate.standardize` when processing the ``dataId``
1313 argument (and may be used to provide a constraining data ID even
1314 when the ``dataId`` argument is `None`).
1316 Yields
1317 ------
1318 ref : `DatasetRef`
1319 Dataset references matching the given query criteria. These
1320 are grouped by `DatasetType` if the query evaluates to multiple
1321 dataset types, but order is otherwise unspecified.
1323 Raises
1324 ------
1325 TypeError
1326 Raised when the arguments are incompatible, such as when a
1327 collection wildcard is passed when ``deduplicate`` is `True`.
1329 Notes
1330 -----
1331 When multiple dataset types are queried in a single call, the
1332 results of this operation are equivalent to querying for each dataset
1333 type separately in turn, and no information about the relationships
1334 between datasets of different types is included. In contexts where
1335 that kind of information is important, the recommended pattern is to
1336 use `queryDimensions` to first obtain data IDs (possibly with the
1337 desired dataset types and collections passed as constraints to the
1338 query), and then use multiple (generally much simpler) calls to
1339 `queryDatasets` with the returned data IDs passed as constraints.
1340 """
1341 # Standardize the collections expression.
1342 if deduplicate:
1343 collections = CollectionSearch.fromExpression(collections)
1344 else:
1345 collections = CollectionQuery.fromExpression(collections)
1346 # Standardize and expand the data ID provided as a constraint.
1347 standardizedDataId = self.expandDataId(dataId, **kwargs)
1349 # We can only query directly if given a non-component DatasetType
1350 # instance. If we were given an expression or str or a component
1351 # DatasetType instance, we'll populate this dict, recurse, and return.
1352 # If we already have a non-component DatasetType, it will remain None
1353 # and we'll run the query directly.
1354 composition: Optional[
1355 Dict[
1356 DatasetType, # parent dataset type
1357 List[Optional[str]] # component name, or None for parent
1358 ]
1359 ] = None
1360 if not isinstance(datasetType, DatasetType):
1361 # We were given a dataset type expression (which may be as simple
1362 # as a str). Loop over all matching datasets, delegating handling
1363 # of the `components` argument to queryDatasetTypes, as we populate
1364 # the composition dict.
1365 composition = defaultdict(list)
1366 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1367 parentName, componentName = trueDatasetType.nameAndComponent()
1368 if componentName is not None:
1369 parentDatasetType = self.getDatasetType(parentName)
1370 composition.setdefault(parentDatasetType, []).append(componentName)
1371 else:
1372 composition.setdefault(trueDatasetType, []).append(None)
1373 elif datasetType.isComponent():
1374 # We were given a true DatasetType instance, but it's a component.
1375 # the composition dict will have exactly one item.
1376 parentName, componentName = datasetType.nameAndComponent()
1377 parentDatasetType = self.getDatasetType(parentName)
1378 composition = {parentDatasetType: [componentName]}
1379 if composition is not None:
1380 # We need to recurse. Do that once for each parent dataset type.
1381 for parentDatasetType, componentNames in composition.items():
1382 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1383 dimensions=dimensions, dataId=standardizedDataId,
1384 where=where, deduplicate=deduplicate):
1385 # Loop over components, yielding one ref for each component
1386 # requested.
1387 for componentName in componentNames:
1388 if componentName is None:
1389 yield parentRef
1390 else:
1391 yield parentRef.makeComponentRef(componentName)
1392 return
1393 # If we get here, there's no need to recurse (or we are already
1394 # recursing; there can only ever be one level of recursion).
1396 # The full set of dimensions in the query is the combination of those
1397 # needed for the DatasetType and those explicitly requested, if any.
1398 requestedDimensionNames = set(datasetType.dimensions.names)
1399 if dimensions is not None:
1400 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1401 # Construct the summary structure needed to construct a QueryBuilder.
1402 summary = QuerySummary(
1403 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1404 dataId=standardizedDataId,
1405 expression=where,
1406 )
1407 builder = self.makeQueryBuilder(summary)
1408 # Add the dataset subquery to the query, telling the QueryBuilder to
1409 # include the rank of the selected collection in the results only if we
1410 # need to deduplicate. Note that if any of the collections are
1411 # actually wildcard expressions, and we've asked for deduplication,
1412 # this will raise TypeError for us.
1413 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1414 return
1415 query = builder.finish()
1416 predicate = query.predicate()
1417 if not deduplicate:
1418 # No need to de-duplicate across collections.
1419 for row in self._db.query(query.sql):
1420 if predicate(row):
1421 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1422 if expand:
1423 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1424 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1425 else:
1426 # For each data ID, yield only the DatasetRef with the lowest
1427 # collection rank.
1428 bestRefs = {}
1429 bestRanks: Dict[DataCoordinate, int] = {}
1430 for row in self._db.query(query.sql):
1431 if predicate(row):
1432 ref, rank = query.extractDatasetRef(row, datasetType)
1433 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1434 assert rank is not None
1435 if rank < bestRank:
1436 bestRefs[ref.dataId] = ref
1437 bestRanks[ref.dataId] = rank
1438 # If caller requested expanded data IDs, we defer that until here
1439 # so we do as little expansion as possible.
1440 if expand:
1441 for ref in bestRefs.values():
1442 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1443 yield ref.expanded(dataId)
1444 else:
1445 yield from bestRefs.values()
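    # Sketch of querying datasets across collections; deduplication requires
    # an ordered, wildcard-free collection search path. Names are
    # hypothetical.
    #
    #     refs = registry.queryDatasets("calexp",
    #                                   collections=["HSC/runs/RC2", "HSC/raw/all"],
    #                                   where="detector = 20", deduplicate=True)
    #     for ref in refs:
    #         print(ref.datasetType.name, ref.dataId, ref.run)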
1447 storageClasses: StorageClassFactory
1448 """All storage classes known to the registry (`StorageClassFactory`).
1449 """