Coverage for python/lsst/daf/butler/registry/_registry.py : 13%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Dict,
36 Iterable,
37 Iterator,
38 List,
39 Mapping,
40 Optional,
41 Type,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48import lsst.sphgeom
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataId,
53 DatasetRef,
54 DatasetType,
55 Dimension,
56 DimensionElement,
57 DimensionGraph,
58 DimensionRecord,
59 DimensionUniverse,
60 ExpandedDataCoordinate,
61 StorageClassFactory,
62)
63from ..core import ddl
64from ..core.utils import doImport, iterable, transactional
65from ._config import RegistryConfig
66from .queries import (
67 QueryBuilder,
68 QuerySummary,
69)
70from .tables import makeRegistryTableSpecs
71from ._collectionType import CollectionType
72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
73from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch
75if TYPE_CHECKING:
76 from ..butlerConfig import ButlerConfig
77 from ..core import (
78 Quantum
79 )
80 from .interfaces import (
81 CollectionManager,
82 Database,
83 OpaqueTableStorageManager,
84 DimensionRecordStorageManager,
85 DatasetRecordStorageManager,
86 DatastoreRegistryBridgeManager,
87 )
90@dataclass
91class ConsistentDataIds:
92 """A struct used to report relationships between data IDs by
93 `Registry.relateDataIds`.
95 If an instance of this class is returned (instead of `None`), the data IDs
96 are "not inconsistent" - any keys they have in common have the same value,
97 and any spatial or temporal relationships they have at least might involve
98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
99 to `True` in boolean contexts.
100 """
102 overlaps: bool
103 """If `True`, the data IDs have at least one key in common, associated with
104 the same value.
106 Note that data IDs are not inconsistent even if overlaps is `False` - they
107 may simply have no keys in common, which means they cannot have
108 inconsistent values for any keys. They may even be equal, in the case that
109 both data IDs are empty.
111 This field does _not_ indicate whether a spatial or temporal overlap
112 relationship exists.
113 """
115 contains: bool
116 """If `True`, all keys in the second data ID are in the first, and are
117 associated with the same values.
119 This includes the case where the second data ID is empty.
120 """
122 within: bool
123 """If `True`, all keys in the first data ID are in the second, and are
124 associated with the same values.
126 This includes the case where the first data ID is empty.
127 """
129 @property
130 def equal(self) -> bool:
131 """If `True`, the two data IDs are the same.
133 Data IDs are equal if they have both a `contains` and a `within`
134 relationship.
135 """
136 return self.contains and self.within
138 @property
139 def disjoint(self) -> bool:
140 """If `True`, the two data IDs have no keys in common.
142 This is simply the opposite of `overlaps`. Disjoint data IDs are by
143 definition not inconsistent.
144 """
145 return not self.overlaps
147 def __bool__(self) -> bool:
148 return True
151class Registry:
152 """Registry interface.
154 Parameters
155 ----------
156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
157 Registry configuration
158 """
160 defaultConfigFile = None
161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
162 absolute path. Can be None if no defaults specified.
163 """
165 @classmethod
166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Uses ``registry.cls`` from `config` to determine which subclass to
171 instantiate.
173 Parameters
174 ----------
175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
176 Registry configuration
177 create : `bool`, optional
178 Assume empty Registry and create a new one.
179 butlerRoot : `str`, optional
180 Path to the repository root this `Registry` will manage.
181 writeable : `bool`, optional
182 If `True` (default) create a read-write connection to the database.
184 Returns
185 -------
186 registry : `Registry` (subclass)
187 A new `Registry` subclass instance.
188 """
189 if not isinstance(config, RegistryConfig):
190 if isinstance(config, str) or isinstance(config, Config):
191 config = RegistryConfig(config)
192 else:
193 raise ValueError("Incompatible Registry configuration: {}".format(config))
194 config.replaceRoot(butlerRoot)
195 DatabaseClass = config.getDatabaseClass()
196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
197 namespace=config.get("namespace"), writeable=writeable)
198 universe = DimensionUniverse(config)
199 opaque = doImport(config["managers", "opaque"])
200 dimensions = doImport(config["managers", "dimensions"])
201 collections = doImport(config["managers", "collections"])
202 datasets = doImport(config["managers", "datasets"])
203 datastoreBridges = doImport(config["managers", "datastores"])
204 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
205 datasets=datasets, datastoreBridges=datastoreBridges, create=create)
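# Usage sketch (added for illustration, not part of the original source): one
# way a client might construct a registry from configuration. The config path
# "registry.yaml" and the shown import location are assumptions.
#
#     from lsst.daf.butler.registry import Registry, RegistryConfig
#
#     config = RegistryConfig("registry.yaml")
#     registry = Registry.fromConfig(config, create=True, writeable=True)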
207 def __init__(self, database: Database, universe: DimensionUniverse, *,
208 opaque: Type[OpaqueTableStorageManager],
209 dimensions: Type[DimensionRecordStorageManager],
210 collections: Type[CollectionManager],
211 datasets: Type[DatasetRecordStorageManager],
212 datastoreBridges: Type[DatastoreRegistryBridgeManager],
213 create: bool = False):
214 self._db = database
215 self.storageClasses = StorageClassFactory()
216 with self._db.declareStaticTables(create=create) as context:
217 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
218 self._collections = collections.initialize(self._db, context)
219 self._datasets = datasets.initialize(self._db, context,
220 collections=self._collections,
221 universe=self.dimensions)
222 self._opaque = opaque.initialize(self._db, context)
223 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
224 opaque=self._opaque,
225 datasets=datasets,
226 universe=self.dimensions)
227 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
228 self._collections,
229 self._datasets))
230 self._collections.refresh()
231 self._datasets.refresh(universe=self._dimensions.universe)
233 def __str__(self) -> str:
234 return str(self._db)
236 def __repr__(self) -> str:
237 return f"Registry({self._db!r}, {self.dimensions!r})"
239 def isWriteable(self) -> bool:
240 """Return `True` if this registry allows write operations, and `False`
241 otherwise.
242 """
243 return self._db.isWriteable()
245 @property
246 def dimensions(self) -> DimensionUniverse:
247 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
248 """
249 return self._dimensions.universe
251 @contextlib.contextmanager
252 def transaction(self):
253 """Return a context manager that represents a transaction.
254 """
255 # TODO make savepoint=False the default.
256 try:
257 with self._db.transaction():
258 yield
259 except BaseException:
260 # TODO: this clears the caches sometimes when we wouldn't actually
261 # need to. Can we avoid that?
262 self._dimensions.clearCaches()
263 raise
265 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
266 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
267 other data repository client.
269 Opaque table records can be added via `insertOpaqueData`, retrieved via
270 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
272 Parameters
273 ----------
274 tableName : `str`
275 Logical name of the opaque table. This may differ from the
276 actual name used in the database by a prefix and/or suffix.
277 spec : `ddl.TableSpec`
278 Specification for the table to be added.
279 """
280 self._opaque.register(tableName, spec)
282 @transactional
283 def insertOpaqueData(self, tableName: str, *data: dict):
284 """Insert records into an opaque table.
286 Parameters
287 ----------
288 tableName : `str`
289 Logical name of the opaque table. Must match the name used in a
290 previous call to `registerOpaqueTable`.
291 data
292 Each additional positional argument is a dictionary that represents
293 a single row to be added.
294 """
295 self._opaque[tableName].insert(*data)
297 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
298 """Retrieve records from an opaque table.
300 Parameters
301 ----------
302 tableName : `str`
303 Logical name of the opaque table. Must match the name used in a
304 previous call to `registerOpaqueTable`.
305 where
306 Additional keyword arguments are interpreted as equality
307 constraints that restrict the returned rows (combined with AND);
308 keyword arguments are column names and values are the values they
309 must have.
311 Yields
312 ------
313 row : `dict`
314 A dictionary representing a single result row.
315 """
316 yield from self._opaque[tableName].fetch(**where)
318 @transactional
319 def deleteOpaqueData(self, tableName: str, **where: Any):
320 """Remove records from an opaque table.
322 Parameters
323 ----------
324 tableName : `str`
325 Logical name of the opaque table. Must match the name used in a
326 previous call to `registerOpaqueTable`.
327 where
328 Additional keyword arguments are interpreted as equality
329 constraints that restrict the deleted rows (combined with AND);
330 keyword arguments are column names and values are the values they
331 must have.
332 """
333 self._opaque[tableName].delete(**where)
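# Usage sketch (illustrative): round-trip through the opaque-table methods.
# Assumes ``registry`` is a writeable `Registry` and ``spec`` is a
# `ddl.TableSpec` built elsewhere; the table and column names are hypothetical.
#
#     registry.registerOpaqueTable("datastore_records", spec)
#     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("datastore_records", dataset_id=1)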
335 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
336 """Add a new collection if one with the given name does not exist.
338 Parameters
339 ----------
340 name : `str`
341 The name of the collection to create.
342 type : `CollectionType`
343 Enum value indicating the type of collection to create.
345 Notes
346 -----
347 This method cannot be called within transactions, as it needs to be
348 able to perform its own transaction to be concurrent.
349 """
350 self._collections.register(name, type)
352 def getCollectionType(self, name: str) -> CollectionType:
353 """Return an enumeration value indicating the type of the given
354 collection.
356 Parameters
357 ----------
358 name : `str`
359 The name of the collection.
361 Returns
362 -------
363 type : `CollectionType`
364 Enum value indicating the type of this collection.
366 Raises
367 ------
368 MissingCollectionError
369 Raised if no collection with the given name exists.
370 """
371 return self._collections.find(name).type
373 def registerRun(self, name: str):
374 """Add a new run if one with the given name does not exist.
376 Parameters
377 ----------
378 name : `str`
379 The name of the run to create.
381 Notes
382 -----
383 This method cannot be called within transactions, as it needs to be
384 able to perform its own transaction to be concurrent.
385 """
386 self._collections.register(name, CollectionType.RUN)
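# Usage sketch (illustrative; collection names are hypothetical and
# ``registry`` is assumed to be a writeable `Registry`):
#
#     registry.registerRun("HSC/runs/example")
#     registry.registerCollection("HSC/tagged", CollectionType.TAGGED)
#     assert registry.getCollectionType("HSC/runs/example") is CollectionType.RUN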
388 @transactional
389 def removeCollection(self, name: str):
390 """Completely remove the given collection.
392 Parameters
393 ----------
394 name : `str`
395 The name of the collection to remove.
397 Raises
398 ------
399 MissingCollectionError
400 Raised if no collection with the given name exists.
402 Notes
403 -----
404 If this is a `~CollectionType.RUN` collection, all datasets and quanta
405 in it are also fully removed. This requires that those datasets be
406 removed (or at least trashed) from any datastores that hold them first.
408 A collection may not be deleted as long as it is referenced by a
409 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
410 be deleted or redefined first.
411 """
412 self._collections.remove(name)
414 def getCollectionChain(self, parent: str) -> CollectionSearch:
415 """Return the child collections in a `~CollectionType.CHAINED`
416 collection.
418 Parameters
419 ----------
420 parent : `str`
421 Name of the chained collection. Must have already been added via
422 a call to `Registry.registerCollection`.
424 Returns
425 -------
426 children : `CollectionSearch`
427 An object that defines the search path of the collection.
428 See :ref:`daf_butler_collection_expressions` for more information.
430 Raises
431 ------
432 MissingCollectionError
433 Raised if ``parent`` does not exist in the `Registry`.
434 TypeError
435 Raised if ``parent`` does not correspond to a
436 `~CollectionType.CHAINED` collection.
437 """
438 record = self._collections.find(parent)
439 if record.type is not CollectionType.CHAINED:
440 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
441 return record.children
443 @transactional
444 def setCollectionChain(self, parent: str, children: Any):
445 """Define or redefine a `~CollectionType.CHAINED` collection.
447 Parameters
448 ----------
449 parent : `str`
450 Name of the chained collection. Must have already been added via
451 a call to `Registry.registerCollection`.
452 children : `Any`
453 An expression defining an ordered search of child collections,
454 generally an iterable of `str`. Restrictions on the dataset types
455 to be searched can also be included, by passing a mapping or an
456 iterable containing tuples; see
457 :ref:`daf_butler_collection_expressions` for more information.
459 Raises
460 ------
461 MissingCollectionError
462 Raised when any of the given collections do not exist in the
463 `Registry`.
464 TypeError
465 Raised if ``parent`` does not correspond to a
466 `~CollectionType.CHAINED` collection.
467 ValueError
468 Raised if the given collections contain a cycle.
469 """
470 record = self._collections.find(parent)
471 if record.type is not CollectionType.CHAINED:
472 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
473 children = CollectionSearch.fromExpression(children)
474 if children != record.children:
475 record.update(self._collections, children)
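# Usage sketch (illustrative; names are hypothetical and the child collections
# are assumed to already exist):
#
#     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/example", "HSC/calib"])
#     children = registry.getCollectionChain("HSC/defaults")  # a CollectionSearch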
477 def registerDatasetType(self, datasetType: DatasetType) -> bool:
478 """
479 Add a new `DatasetType` to the Registry.
481 It is not an error to register the same `DatasetType` twice.
483 Parameters
484 ----------
485 datasetType : `DatasetType`
486 The `DatasetType` to be added.
488 Returns
489 -------
490 inserted : `bool`
491 `True` if ``datasetType`` was inserted, `False` if an identical
492 existing `DatasetType` was found. Note that in either case the
493 DatasetType is guaranteed to be defined in the Registry
494 consistently with the given definition.
496 Raises
497 ------
498 ValueError
499 Raised if the dimensions or storage class are invalid.
500 ConflictingDefinitionError
501 Raised if this DatasetType is already registered with a different
502 definition.
504 Notes
505 -----
506 This method cannot be called within transactions, as it needs to be
507 able to perform its own transaction to be concurrent.
508 """
509 _, inserted = self._datasets.register(datasetType)
510 return inserted
512 def getDatasetType(self, name: str) -> DatasetType:
513 """Get the `DatasetType`.
515 Parameters
516 ----------
517 name : `str`
518 Name of the type.
520 Returns
521 -------
522 type : `DatasetType`
523 The `DatasetType` associated with the given name.
525 Raises
526 ------
527 KeyError
528 Requested named DatasetType could not be found in registry.
529 """
530 storage = self._datasets.find(name)
531 if storage is None:
532 raise KeyError(f"DatasetType '{name}' could not be found.")
533 return storage.datasetType
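# Usage sketch (illustrative; the dataset type name, dimensions, and storage
# class are assumptions, and the `DatasetType` constructor arguments are shown
# as commonly used rather than as a definitive signature):
#
#     datasetType = DatasetType("calexp", dimensions=("instrument", "visit", "detector"),
#                               storageClass="ExposureF", universe=registry.dimensions)
#     registry.registerDatasetType(datasetType)
#     assert registry.getDatasetType("calexp") == datasetType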
535 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
536 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
537 """Find a dataset given its `DatasetType` and data ID.
539 This can be used to obtain a `DatasetRef` that permits the dataset to
540 be read from a `Datastore`. If the dataset is a component and can not
541 be found using the provided dataset type, a dataset ref for the parent
542 will be returned instead but with the correct dataset type.
544 Parameters
545 ----------
546 datasetType : `DatasetType` or `str`
547 A `DatasetType` or the name of one.
548 dataId : `dict` or `DataCoordinate`, optional
549 A `dict`-like object containing the `Dimension` links that identify
550 the dataset within a collection.
551 collections
552 An expression that fully or partially identifies the collections
553 to search for the dataset, such as a `str`, `re.Pattern`, or
554 iterable thereof. `...` can be used to search all collections.
555 See :ref:`daf_butler_collection_expressions` for more information.
556 **kwargs
557 Additional keyword arguments passed to
558 `DataCoordinate.standardize` to convert ``dataId`` to a true
559 `DataCoordinate` or augment an existing one.
561 Returns
562 -------
563 ref : `DatasetRef`
564 A reference to the dataset, or `None` if no matching Dataset
565 was found.
567 Raises
568 ------
569 LookupError
570 Raised if one or more data ID keys are missing or the dataset type
571 does not exist.
572 MissingCollectionError
573 Raised if any of ``collections`` does not exist in the registry.
574 """
575 if isinstance(datasetType, DatasetType):
576 storage = self._datasets.find(datasetType.name)
577 if storage is None:
578 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
579 else:
580 storage = self._datasets.find(datasetType)
581 if storage is None:
582 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
583 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
584 universe=self.dimensions, **kwargs)
585 collections = CollectionSearch.fromExpression(collections)
586 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
587 result = storage.find(collectionRecord, dataId)
588 if result is not None:
589 return result
591 # Fall back to the parent if we got nothing and this was a component.
592 if storage.datasetType.isComponent():
593 parentType, _ = storage.datasetType.nameAndComponent()
594 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
595 if parentRef is not None:
596 # Should already conform and we know no components
597 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
598 run=parentRef.run, conform=False, hasParentId=True)
600 return None
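# Usage sketch (illustrative; the data ID values and collection name are
# hypothetical):
#
#     ref = registry.findDataset("calexp", instrument="HSC", visit=903334, detector=42,
#                                collections=["HSC/runs/example"])
#     if ref is not None:
#         print(ref.id, ref.run)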
602 @transactional
603 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
604 run: str, *, producer: Optional[Quantum] = None) -> List[DatasetRef]:
605 """Insert one or more datasets into the `Registry`
607 This always adds new datasets; to associate existing datasets with
608 a new collection, use ``associate``.
610 Parameters
611 ----------
612 datasetType : `DatasetType` or `str`
613 A `DatasetType` or the name of one.
614 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
615 Dimension-based identifiers for the new datasets.
616 run : `str`
617 The name of the run that produced the datasets.
618 producer : `Quantum`
619 Unit of work that produced the datasets. May be `None` to store
620 no provenance information, but if present the `Quantum` must
621 already have been added to the Registry.
623 Returns
624 -------
625 refs : `list` of `DatasetRef`
626 Resolved `DatasetRef` instances for all given data IDs (in the same
627 order).
629 Raises
630 ------
631 ConflictingDefinitionError
632 If a dataset with the same dataset type and data ID as one of those
633 given already exists in ``run``.
634 MissingCollectionError
635 Raised if ``run`` does not exist in the registry.
636 """
637 if isinstance(datasetType, DatasetType):
638 storage = self._datasets.find(datasetType.name)
639 if storage is None:
640 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
641 else:
642 storage = self._datasets.find(datasetType)
643 if storage is None:
644 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
645 runRecord = self._collections.find(run)
646 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds]
647 try:
648 refs = list(storage.insert(runRecord, dataIds, quantum=producer))
649 except sqlalchemy.exc.IntegrityError as err:
650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
651 f"one or more datasets of type {storage.datasetType} into "
652 f"collection '{run}'. "
653 f"This probably means a dataset with the same data ID "
654 f"and dataset type already exists, but it may also mean a "
655 f"dimension row is missing.") from err
656 return refs
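# Usage sketch (illustrative; assumes the "calexp" dataset type and the run
# collection have already been registered):
#
#     refs = registry.insertDatasets(
#         "calexp",
#         dataIds=[{"instrument": "HSC", "visit": 903334, "detector": 42}],
#         run="HSC/runs/example",
#     )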
658 def getDataset(self, id: int) -> Optional[DatasetRef]:
659 """Retrieve a Dataset entry.
661 Parameters
662 ----------
663 id : `int`
664 The unique identifier for the dataset.
666 Returns
667 -------
668 ref : `DatasetRef` or `None`
669 A ref to the Dataset, or `None` if no matching Dataset
670 was found.
671 """
672 ref = self._datasets.getDatasetRef(id)
673 if ref is None:
674 return None
675 return ref
677 @transactional
678 def removeDatasets(self, refs: Iterable[DatasetRef]):
679 """Remove datasets from the Registry.
681 The datasets will be removed unconditionally from all collections, and
682 any `Quantum` that consumed this dataset will instead be marked as
683 having a NULL input. `Datastore` records will *not* be deleted; the
684 caller is responsible for ensuring that the dataset has already been
685 removed from all Datastores.
687 Parameters
688 ----------
689 refs : `Iterable` of `DatasetRef`
690 References to the datasets to be removed. Must include a valid
691 ``id`` attribute, and should be considered invalidated upon return.
693 Raises
694 ------
695 AmbiguousDatasetError
696 Raised if any ``ref.id`` is `None`.
697 OrphanedRecordError
698 Raised if any dataset is still present in any `Datastore`.
699 """
700 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
701 storage = self._datasets.find(datasetType.name)
702 try:
703 storage.delete(refsForType)
704 except sqlalchemy.exc.IntegrityError as err:
705 raise OrphanedRecordError("One or more datasets is still "
706 "present in one or more Datastores.") from err
708 @transactional
709 def associate(self, collection: str, refs: Iterable[DatasetRef]):
710 """Add existing datasets to a `~CollectionType.TAGGED` collection.
712 If a `DatasetRef` with the same integer ID is already in the
713 collection, nothing is changed. If a `DatasetRef` with the same
714 `DatasetType` and data ID but a different integer ID already
715 exists in the collection, `ConflictingDefinitionError` is raised.
717 Parameters
718 ----------
719 collection : `str`
720 Indicates the collection the datasets should be associated with.
721 refs : `Iterable` [ `DatasetRef` ]
722 An iterable of resolved `DatasetRef` instances that already exist
723 in this `Registry`.
725 Raises
726 ------
727 ConflictingDefinitionError
728 If a Dataset with the given `DatasetRef` already exists in the
729 given collection.
730 AmbiguousDatasetError
731 Raised if ``any(ref.id is None for ref in refs)``.
732 MissingCollectionError
733 Raised if ``collection`` does not exist in the registry.
734 TypeError
735 Raised if adding new datasets to the given ``collection`` is not
736 allowed.
737 """
738 collectionRecord = self._collections.find(collection)
739 if collectionRecord.type is not CollectionType.TAGGED:
740 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
741 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
742 storage = self._datasets.find(datasetType.name)
743 try:
744 storage.associate(collectionRecord, refsForType)
745 except sqlalchemy.exc.IntegrityError as err:
746 raise ConflictingDefinitionError(
747 f"Constraint violation while associating dataset of type {datasetType.name} with "
748 f"collection {collection}. This probably means that one or more datasets with the same "
749 f"dataset type and data ID already exist in the collection, but it may also indicate "
750 f"that the datasets do not exist."
751 ) from err
753 @transactional
754 def disassociate(self, collection: str, refs: Iterable[DatasetRef]):
755 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
757 ``collection`` and ``ref`` combinations that are not currently
758 associated are silently ignored.
760 Parameters
761 ----------
762 collection : `str`
763 The collection the datasets should no longer be associated with.
764 refs : `Iterable` [ `DatasetRef` ]
765 An iterable of resolved `DatasetRef` instances that already exist
766 in this `Registry`.
768 Raises
769 ------
770 AmbiguousDatasetError
771 Raised if any of the given dataset references is unresolved.
772 MissingCollectionError
773 Raised if ``collection`` does not exist in the registry.
774 TypeError
775 Raised if removing datasets from the given ``collection`` is not
776 allowed.
777 """
778 collectionRecord = self._collections.find(collection)
779 if collectionRecord.type is not CollectionType.TAGGED:
780 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
781 "expected TAGGED.")
782 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
783 storage = self._datasets.find(datasetType.name)
784 storage.disassociate(collectionRecord, refsForType)
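# Usage sketch (illustrative; ``refs`` is assumed to be an iterable of resolved
# `DatasetRef` instances, e.g. the return value of `insertDatasets` or
# `queryDatasets`, and the TAGGED collection is assumed to exist):
#
#     registry.associate("HSC/tagged", refs)
#     registry.disassociate("HSC/tagged", refs)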
786 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
787 # TODO docs
788 return self._datastoreBridges
790 def getDatasetLocations(self, ref: DatasetRef) -> Iterator[str]:
791 """Retrieve datastore locations for a given dataset.
793 Typically used by `Datastore`.
795 Parameters
796 ----------
797 ref : `DatasetRef`
798 A reference to the dataset for which to retrieve storage
799 information.
801 Returns
802 -------
803 datastores : `Iterable` [ `str` ]
804 All the matching datastores holding this dataset.
806 Raises
807 ------
808 AmbiguousDatasetError
809 Raised if ``ref.id`` is `None`.
810 """
811 return self._datastoreBridges.findDatastores(ref)
813 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
814 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
815 """Expand a dimension-based data ID to include additional information.
817 Parameters
818 ----------
819 dataId : `DataCoordinate` or `dict`, optional
820 Data ID to be expanded; augmented and overridden by ``kwds``.
821 graph : `DimensionGraph`, optional
822 Set of dimensions for the expanded ID. If `None`, the dimensions
823 will be inferred from the keys of ``dataId`` and ``kwds``.
824 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
825 are silently ignored, providing a way to extract and expand a
826 subset of a data ID.
827 records : mapping [`DimensionElement`, `DimensionRecord`], optional
828 Dimension record data to use before querying the database for that
829 data.
830 **kwds
831 Additional keywords are treated like additional key-value pairs for
832 ``dataId``, extending and overriding it.
834 Returns
835 -------
836 expanded : `ExpandedDataCoordinate`
837 A data ID that includes full metadata for all of the dimensions it
838 identifies.
839 """
840 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
841 if isinstance(standardized, ExpandedDataCoordinate):
842 return standardized
843 elif isinstance(dataId, ExpandedDataCoordinate):
844 records = dict(records) if records is not None else {}
845 records.update(dataId.records)
846 else:
847 records = dict(records) if records is not None else {}
848 keys = dict(standardized)
849 regions = []
850 timespans = []
851 for element in standardized.graph.primaryKeyTraversalOrder:
852 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
853 if record is ...:
854 storage = self._dimensions[element]
855 record = storage.fetch(keys)
856 records[element] = record
857 if record is not None:
858 for d in element.implied:
859 value = getattr(record, d.name)
860 if keys.setdefault(d, value) != value:
861 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
862 f"but {element.name} implies {d.name}={value!r}.")
863 if element in standardized.graph.spatial and record.region is not None:
864 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
865 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
866 f"is disjoint with those for other elements.")
867 regions.append(record.region)
868 if element in standardized.graph.temporal:
869 if any(not record.timespan.overlaps(t) for t in timespans):
870 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
871 f" is disjoint with those for other elements.")
872 timespans.append(record.timespan)
873 else:
874 if element in standardized.graph.required:
875 raise LookupError(
876 f"Could not fetch record for required dimension {element.name} via keys {keys}."
877 )
878 if element.alwaysJoin:
879 raise InconsistentDataIdError(
880 f"Could not fetch record for element {element.name} via keys {keys}, "
881 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
882 "related."
883 )
884 records.update((d, None) for d in element.implied)
885 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
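# Usage sketch (illustrative; the data ID values are hypothetical and the
# corresponding dimension records are assumed to be present in the registry):
#
#     expanded = registry.expandDataId({"instrument": "HSC", "detector": 42})
#     # ``expanded`` is an ExpandedDataCoordinate carrying the fetched records.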
887 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
888 """Compare the keys and values of a pair of data IDs for consistency.
890 See `ConsistentDataIds` for more information.
892 Parameters
893 ----------
894 a : `dict` or `DataCoordinate`
895 First data ID to be compared.
896 b : `dict` or `DataCoordinate`
897 Second data ID to be compared.
899 Returns
900 -------
901 relationship : `ConsistentDataIds` or `None`
902 Relationship information. This is not `None` and coerces to
903 `True` in boolean contexts if and only if the data IDs are
904 consistent in terms of all common key-value pairs, all many-to-many
905 join tables, and all spatial and temporal relationships.
906 """
907 a = DataCoordinate.standardize(a, universe=self.dimensions)
908 b = DataCoordinate.standardize(b, universe=self.dimensions)
909 aFull = getattr(a, "full", None)
910 bFull = getattr(b, "full", None)
911 aBest = aFull if aFull is not None else a
912 bBest = bFull if bFull is not None else b
913 jointKeys = aBest.keys() & bBest.keys()
914 # If any common values are not equal, we know they are inconsistent.
915 if any(aBest[k] != bBest[k] for k in jointKeys):
916 return None
917 # If the graphs are equal, we know the data IDs are.
918 if a.graph == b.graph:
919 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
920 # Result is still inconclusive. Try to expand a data ID containing
921 # keys from both; that will fail if they are inconsistent.
922 # First, if either input was already an ExpandedDataCoordinate, extract
923 # its records so we don't have to query for them.
924 records = {}
925 if hasattr(a, "records"):
926 records.update(a.records)
927 if hasattr(b, "records"):
928 records.update(b.records)
929 try:
930 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
931 except InconsistentDataIdError:
932 return None
933 # We know the answer is not `None`; time to figure out what it is.
934 return ConsistentDataIds(
935 contains=(a.graph >= b.graph),
936 within=(a.graph <= b.graph),
937 overlaps=bool(a.graph & b.graph),
938 )
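# Usage sketch (illustrative; values are hypothetical). A truthy result means
# the two data IDs are not inconsistent:
#
#     rel = registry.relateDataIds({"instrument": "HSC"},
#                                  {"instrument": "HSC", "detector": 42})
#     if rel and rel.within:
#         ...  # every key of the first data ID is present in the second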
940 def insertDimensionData(self, element: Union[DimensionElement, str],
941 *data: Union[dict, DimensionRecord],
942 conform: bool = True):
943 """Insert one or more dimension records into the database.
945 Parameters
946 ----------
947 element : `DimensionElement` or `str`
948 The `DimensionElement` or name thereof that identifies the table
949 records will be inserted into.
950 data : `dict` or `DimensionRecord` (variadic)
951 One or more records to insert.
952 conform : `bool`, optional
953 If `False` (`True` is default) perform no checking or conversions,
954 and assume that ``element`` is a `DimensionElement` instance and
955 ``data`` is one or more `DimensionRecord` instances of the
956 appropriate subclass.
957 """
958 if conform:
959 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
960 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
961 for row in data]
962 else:
963 records = data
964 storage = self._dimensions[element]
965 storage.insert(*records)
967 def syncDimensionData(self, element: Union[DimensionElement, str],
968 row: Union[dict, DimensionRecord],
969 conform: bool = True) -> bool:
970 """Synchronize the given dimension record with the database, inserting
971 if it does not already exist and comparing values if it does.
973 Parameters
974 ----------
975 element : `DimensionElement` or `str`
976 The `DimensionElement` or name thereof that identifies the table
977 records will be inserted into.
978 row : `dict` or `DimensionRecord`
979 The record to insert.
980 conform : `bool`, optional
981 If `False` (`True` is default) perform no checking or conversions,
982 and assume that ``element`` is a `DimensionElement` instance and
983 ``row`` is a `DimensionRecord` instance of the
984 appropriate subclass.
986 Returns
987 -------
988 inserted : `bool`
989 `True` if a new row was inserted, `False` otherwise.
991 Raises
992 ------
993 ConflictingDefinitionError
994 Raised if the record exists in the database (according to primary
995 key lookup) but is inconsistent with the given one.
997 Notes
998 -----
999 This method cannot be called within transactions, as it needs to be
1000 able to perform its own transaction to be concurrent.
1001 """
1002 if conform:
1003 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1004 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1005 else:
1006 record = row
1007 storage = self._dimensions[element]
1008 return storage.sync(record)
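# Usage sketch (illustrative; a real record must supply all of the fields the
# dimension element defines, which are elided here):
#
#     row = {"name": "HSC"}  # plus the remaining fields required by "instrument"
#     registry.insertDimensionData("instrument", row)
#     inserted = registry.syncDimensionData("instrument", row)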
1010 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1011 ) -> Iterator[DatasetType]:
1012 """Iterate over the dataset types whose names match an expression.
1014 Parameters
1015 ----------
1016 expression : `Any`, optional
1017 An expression that fully or partially identifies the dataset types
1018 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1019 `...` can be used to return all dataset types, and is the default.
1020 See :ref:`daf_butler_dataset_type_expressions` for more
1021 information.
1022 components : `bool`, optional
1023 If `True`, apply all expression patterns to component dataset type
1024 names as well. If `False`, never apply patterns to components.
1025 If `None` (default), apply patterns to components only if their
1026 parent datasets were not matched by the expression.
1027 Fully-specified component datasets (`str` or `DatasetType`
1028 instances) are always included.
1030 Yields
1031 ------
1032 datasetType : `DatasetType`
1033 A `DatasetType` instance whose name matches ``expression``.
1034 """
1035 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1036 if wildcard is ...:
1037 for datasetType in self._datasets:
1038 if components or not datasetType.isComponent():
1039 yield datasetType
1040 return
1041 done = set()
1042 for name in wildcard.strings:
1043 storage = self._datasets.find(name)
1044 if storage is not None:
1045 done.add(storage.datasetType)
1046 yield storage.datasetType
1047 if wildcard.patterns:
1048 # If components (the argument) is None, we'll save the component
1049 # dataset types that we might want to match, but only if their
1050 # parents didn't get included.
1051 componentsForLater = []
1052 for datasetType in self._datasets:
1053 if datasetType.name in done:
1054 continue
1055 parentName, componentName = datasetType.nameAndComponent()
1056 if componentName is not None and not components:
1057 if components is None and parentName not in done:
1058 componentsForLater.append(datasetType)
1059 continue
1060 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1061 done.add(datasetType.name)
1062 yield datasetType
1063 # Go back and try to match saved components.
1064 for datasetType in componentsForLater:
1065 parentName, _ = datasetType.nameAndComponent()
1066 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1067 yield datasetType
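# Usage sketch (illustrative; the "deepCoadd" pattern is hypothetical):
#
#     import re
#     allTypes = list(registry.queryDatasetTypes(...))  # everything
#     coadds = list(registry.queryDatasetTypes(re.compile(r"deepCoadd.*"),
#                                              components=False))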
1069 def queryCollections(self, expression: Any = ...,
1070 datasetType: Optional[DatasetType] = None,
1071 collectionType: Optional[CollectionType] = None,
1072 flattenChains: bool = False,
1073 includeChains: Optional[bool] = None) -> Iterator[str]:
1074 """Iterate over the collections whose names match an expression.
1076 Parameters
1077 ----------
1078 expression : `Any`, optional
1079 An expression that fully or partially identifies the collections
1080 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1081 `...` can be used to return all collections, and is the default.
1082 See :ref:`daf_butler_collection_expressions` for more
1083 information.
1084 datasetType : `DatasetType`, optional
1085 If provided, only yield collections that should be searched for
1086 this dataset type according to ``expression``. If this is
1087 not provided, any dataset type restrictions in ``expression`` are
1088 ignored.
1089 collectionType : `CollectionType`, optional
1090 If provided, only yield collections of this type.
1091 flattenChains : `bool`, optional
1092 If `True` (`False` is default), recursively yield the child
1093 collections of matching `~CollectionType.CHAINED` collections.
1094 includeChains : `bool`, optional
1095 If `True`, yield records for matching `~CollectionType.CHAINED`
1096 collections. Default is the opposite of ``flattenChains``: include
1097 either CHAINED collections or their children, but not both.
1099 Yields
1100 ------
1101 collection : `str`
1102 The name of a collection that matches ``expression``.
1103 """
1104 query = CollectionQuery.fromExpression(expression)
1105 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1106 flattenChains=flattenChains, includeChains=includeChains):
1107 yield record.name
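# Usage sketch (illustrative; the pattern is hypothetical):
#
#     import re
#     runs = list(registry.queryCollections(re.compile(r"HSC/runs/.+"),
#                                           collectionType=CollectionType.RUN))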
1109 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1110 """Return a `QueryBuilder` instance capable of constructing and
1111 managing more complex queries than those obtainable via `Registry`
1112 interfaces.
1114 This is an advanced interface; downstream code should prefer
1115 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1116 are sufficient.
1118 Parameters
1119 ----------
1120 summary : `QuerySummary`
1121 Object describing and categorizing the full set of dimensions that
1122 will be included in the query.
1124 Returns
1125 -------
1126 builder : `QueryBuilder`
1127 Object that can be used to construct and perform advanced queries.
1128 """
1129 return QueryBuilder(summary=summary,
1130 collections=self._collections,
1131 dimensions=self._dimensions,
1132 datasets=self._datasets)
1134 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1135 dataId: Optional[DataId] = None,
1136 datasets: Any = None,
1137 collections: Any = None,
1138 where: Optional[str] = None,
1139 expand: bool = True,
1140 components: Optional[bool] = None,
1141 **kwds) -> Iterator[DataCoordinate]:
1142 """Query for and iterate over data IDs matching user-provided criteria.
1144 Parameters
1145 ----------
1146 dimensions : `Dimension` or `str`, or iterable thereof
1147 The dimensions of the data IDs to yield, as either `Dimension`
1148 instances or `str`. Will be automatically expanded to a complete
1149 `DimensionGraph`.
1150 dataId : `dict` or `DataCoordinate`, optional
1151 A data ID whose key-value pairs are used as equality constraints
1152 in the query.
1153 datasets : `Any`, optional
1154 An expression that fully or partially identifies dataset types
1155 that should constrain the yielded data IDs. For example, including
1156 "raw" here would constrain the yielded ``instrument``,
1157 ``exposure``, ``detector``, and ``physical_filter`` values to only
1158 those for which at least one "raw" dataset exists in
1159 ``collections``. Allowed types include `DatasetType`, `str`,
1160 `re.Pattern`, and iterables thereof. Unlike other dataset type
1161 expressions, `...` is not permitted - it doesn't make sense to
1162 constrain data IDs on the existence of *all* datasets.
1163 See :ref:`daf_butler_dataset_type_expressions` for more
1164 information.
1165 collections : `Any`, optional
1166 An expression that fully or partially identifies the collections
1167 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1168 thereof. `...` can be used to return all collections. Must be
1169 provided if ``datasets`` is, and is ignored if it is not. See
1170 :ref:`daf_butler_collection_expressions` for more information.
1171 where : `str`, optional
1172 A string expression similar to a SQL WHERE clause. May involve
1173 any column of a dimension table or (as a shortcut for the primary
1174 key column of a dimension table) dimension name. See
1175 :ref:`daf_butler_dimension_expressions` for more information.
1176 expand : `bool`, optional
1177 If `True` (default) yield `ExpandedDataCoordinate` instead of
1178 minimal `DataCoordinate` base-class instances.
1179 components : `bool`, optional
1180 If `True`, apply all dataset expression patterns to component
1181 dataset type names as well. If `False`, never apply patterns to
1182 components. If `None` (default), apply patterns to components only
1183 if their parent datasets were not matched by the expression.
1184 Fully-specified component datasets (`str` or `DatasetType`
1185 instances) are always included.
1186 kwds
1187 Additional keyword arguments are forwarded to
1188 `DataCoordinate.standardize` when processing the ``dataId``
1189 argument (and may be used to provide a constraining data ID even
1190 when the ``dataId`` argument is `None`).
1192 Yields
1193 ------
1194 dataId : `DataCoordinate`
1195 Data IDs matching the given query parameters. Order is
1196 unspecified.
1197 """
1198 dimensions = iterable(dimensions)
1199 standardizedDataId = self.expandDataId(dataId, **kwds)
1200 standardizedDatasetTypes = set()
1201 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1202 if datasets is not None:
1203 if collections is None:
1204 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1205 for datasetType in self.queryDatasetTypes(datasets, components=components):
1206 requestedDimensionNames.update(datasetType.dimensions.names)
1207 # If any matched dataset type is a component, just operate on
1208 # its parent instead, because Registry doesn't know anything
1209 # about what components exist, and here (unlike queryDatasets)
1210 # we don't care about returning them.
1211 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1212 if componentName is not None:
1213 datasetType = self.getDatasetType(parentDatasetTypeName)
1214 standardizedDatasetTypes.add(datasetType)
1215 # Preprocess collections expression in case the original included
1216 # single-pass iterators (we'll want to use it multiple times
1217 # below).
1218 collections = CollectionQuery.fromExpression(collections)
1220 summary = QuerySummary(
1221 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1222 dataId=standardizedDataId,
1223 expression=where,
1224 )
1225 builder = self.makeQueryBuilder(summary)
1226 for datasetType in standardizedDatasetTypes:
1227 builder.joinDataset(datasetType, collections, isResult=False)
1228 query = builder.finish()
1229 predicate = query.predicate()
1230 for row in self._db.query(query.sql):
1231 if predicate(row):
1232 result = query.extractDataId(row)
1233 if expand:
1234 yield self.expandDataId(result, records=standardizedDataId.records)
1235 else:
1236 yield result
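# Usage sketch (illustrative; the dataset type, collection, and ``where``
# constraint are assumptions about repository contents, not requirements of
# the API):
#
#     dataIds = list(registry.queryDimensions(["exposure", "detector"],
#                                             datasets="raw",
#                                             collections=["HSC/raw/all"],
#                                             where="instrument='HSC'"))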
1238 def queryDatasets(self, datasetType: Any, *,
1239 collections: Any,
1240 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1241 dataId: Optional[DataId] = None,
1242 where: Optional[str] = None,
1243 deduplicate: bool = False,
1244 expand: bool = True,
1245 components: Optional[bool] = None,
1246 **kwds) -> Iterator[DatasetRef]:
1247 """Query for and iterate over dataset references matching user-provided
1248 criteria.
1250 Parameters
1251 ----------
1252 datasetType
1253 An expression that fully or partially identifies the dataset types
1254 to be queried. Allowed types include `DatasetType`, `str`,
1255 `re.Pattern`, and iterables thereof. The special value `...` can
1256 be used to query all dataset types. See
1257 :ref:`daf_butler_dataset_type_expressions` for more information.
1258 collections
1259 An expression that fully or partially identifies the collections
1260 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1261 thereof. `...` can be used to return all collections. See
1262 :ref:`daf_butler_collection_expressions` for more information.
1263 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1264 Dimensions to include in the query (in addition to those used
1265 to identify the queried dataset type(s)), either to constrain
1266 the resulting datasets to those for which a matching dimension
1267 exists, or to relate the dataset type's dimensions to dimensions
1268 referenced by the ``dataId`` or ``where`` arguments.
1269 dataId : `dict` or `DataCoordinate`, optional
1270 A data ID whose key-value pairs are used as equality constraints
1271 in the query.
1272 where : `str`, optional
1273 A string expression similar to a SQL WHERE clause. May involve
1274 any column of a dimension table or (as a shortcut for the primary
1275 key column of a dimension table) dimension name. See
1276 :ref:`daf_butler_dimension_expressions` for more information.
1277 deduplicate : `bool`, optional
1278 If `True` (`False` is default), for each result data ID, only
1279 yield one `DatasetRef` of each `DatasetType`, from the first
1280 collection in which a dataset of that dataset type appears
1281 (according to the order of ``collections`` passed in). If `True`,
1282 ``collections`` must not contain regular expressions and may not
1283 be `...`.
1284 expand : `bool`, optional
1285 If `True` (default) attach `ExpandedDataCoordinate` instead of
1286 minimal `DataCoordinate` base-class instances.
1287 components : `bool`, optional
1288 If `True`, apply all dataset expression patterns to component
1289 dataset type names as well. If `False`, never apply patterns to
1290 components. If `None` (default), apply patterns to components only
1291 if their parent datasets were not matched by the expression.
1292 Fully-specified component datasets (`str` or `DatasetType`
1293 instances) are always included.
1294 kwds
1295 Additional keyword arguments are forwarded to
1296 `DataCoordinate.standardize` when processing the ``dataId``
1297 argument (and may be used to provide a constraining data ID even
1298 when the ``dataId`` argument is `None`).
1300 Yields
1301 ------
1302 ref : `DatasetRef`
1303 Dataset references matching the given query criteria. These
1304 are grouped by `DatasetType` if the query evaluates to multiple
1305 dataset types, but order is otherwise unspecified.
1307 Raises
1308 ------
1309 TypeError
1310 Raised when the arguments are incompatible, such as when a
1311 collection wildcard is passed when ``deduplicate`` is `True`.
1313 Notes
1314 -----
1315 When multiple dataset types are queried in a single call, the
1316 results of this operation are equivalent to querying for each dataset
1317 type separately in turn, and no information about the relationships
1318 between datasets of different types is included. In contexts where
1319 that kind of information is important, the recommended pattern is to
1320 use `queryDimensions` to first obtain data IDs (possibly with the
1321 desired dataset types and collections passed as constraints to the
1322 query), and then use multiple (generally much simpler) calls to
1323 `queryDatasets` with the returned data IDs passed as constraints.
1324 """
1325 # Standardize the collections expression.
1326 if deduplicate:
1327 collections = CollectionSearch.fromExpression(collections)
1328 else:
1329 collections = CollectionQuery.fromExpression(collections)
1330 # Standardize and expand the data ID provided as a constraint.
1331 standardizedDataId = self.expandDataId(dataId, **kwds)
1333 # We can only query directly if given a non-component DatasetType
1334 # instance. If we were given an expression or str or a component
1335 # DatasetType instance, we'll populate this dict, recurse, and return.
1336 # If we already have a non-component DatasetType, it will remain None
1337 # and we'll run the query directly.
1338 composition: Optional[
1339 Dict[
1340 DatasetType, # parent dataset type
1341 List[Optional[str]] # component name, or None for parent
1342 ]
1343 ] = None
1344 if not isinstance(datasetType, DatasetType):
1345 # We were given a dataset type expression (which may be as simple
1346 # as a str). Loop over all matching datasets, delegating handling
1347 # of the `components` argument to queryDatasetTypes, as we populate
1348 # the composition dict.
1349 composition = defaultdict(list)
1350 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1351 parentName, componentName = trueDatasetType.nameAndComponent()
1352 if componentName is not None:
1353 parentDatasetType = self.getDatasetType(parentName)
1354 composition.setdefault(parentDatasetType, []).append(componentName)
1355 else:
1356 composition.setdefault(trueDatasetType, []).append(None)
1357 elif datasetType.isComponent():
1358 # We were given a true DatasetType instance, but it's a component.
1359 # the composition dict will have exactly one item.
1360 parentName, componentName = datasetType.nameAndComponent()
1361 parentDatasetType = self.getDatasetType(parentName)
1362 composition = {parentDatasetType: [componentName]}
1363 if composition is not None:
1364 # We need to recurse. Do that once for each parent dataset type.
1365 for parentDatasetType, componentNames in composition.items():
1366 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1367 dimensions=dimensions, dataId=standardizedDataId,
1368 where=where, deduplicate=deduplicate):
1369 # Loop over components, yielding one ref for each component
1370 # requested.
1371 for componentName in componentNames:
1372 if componentName is None:
1373 yield parentRef
1374 else:
1375 yield parentRef.makeComponentRef(componentName)
1376 return
1377 # If we get here, there's no need to recurse (or we are already
1378 # recursing; there can only ever be one level of recursion).
1380 # The full set of dimensions in the query is the combination of those
1381 # needed for the DatasetType and those explicitly requested, if any.
1382 requestedDimensionNames = set(datasetType.dimensions.names)
1383 if dimensions is not None:
1384 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1385 # Construct the summary structure needed to construct a QueryBuilder.
1386 summary = QuerySummary(
1387 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1388 dataId=standardizedDataId,
1389 expression=where,
1390 )
1391 builder = self.makeQueryBuilder(summary)
1392 # Add the dataset subquery to the query, telling the QueryBuilder to
1393 # include the rank of the selected collection in the results only if we
1394 # need to deduplicate. Note that if any of the collections are
1395 # actually wildcard expressions, and we've asked for deduplication,
1396 # this will raise TypeError for us.
1397 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1398 return
1399 query = builder.finish()
1400 predicate = query.predicate()
1401 if not deduplicate:
1402 # No need to de-duplicate across collections.
1403 for row in self._db.query(query.sql):
1404 if predicate(row):
1405 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1406 if expand:
1407 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1408 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1409 else:
1410 # For each data ID, yield only the DatasetRef with the lowest
1411 # collection rank.
1412 bestRefs = {}
1413 bestRanks = {}
1414 for row in self._db.query(query.sql):
1415 if predicate(row):
1416 ref, rank = query.extractDatasetRef(row, datasetType)
1417 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1418 if rank < bestRank:
1419 bestRefs[ref.dataId] = ref
1420 bestRanks[ref.dataId] = rank
1421 # If caller requested expanded data IDs, we defer that until here
1422 # so we do as little expansion as possible.
1423 if expand:
1424 for ref in bestRefs.values():
1425 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1426 yield ref.expanded(dataId)
1427 else:
1428 yield from bestRefs.values()
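# Usage sketch (illustrative; names are hypothetical). Passing an ordered list
# of collections with ``deduplicate=True`` yields at most one ref per data ID:
#
#     refs = list(registry.queryDatasets("calexp",
#                                        collections=["HSC/runs/b", "HSC/runs/a"],
#                                        where="detector=42 AND instrument='HSC'",
#                                        deduplicate=True))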
1430 dimensions: DimensionUniverse
1431 """The universe of all dimensions known to the registry
1432 (`DimensionUniverse`).
1433 """
1435 storageClasses: StorageClassFactory
1436 """All storage classes known to the registry (`StorageClassFactory`).
1437 """