Coverage for python/lsst/daf/butler/registry/_registry.py : 14%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47import lsst.sphgeom
48from ..core import (
49 Config,
50 DataCoordinate,
51 DataId,
52 DatasetRef,
53 DatasetType,
54 Dimension,
55 DimensionElement,
56 DimensionGraph,
57 DimensionRecord,
58 DimensionUniverse,
59 ExpandedDataCoordinate,
60 StorageClassFactory,
61)
62from ..core import ddl
63from ..core.utils import doImport, iterable, transactional
64from ._config import RegistryConfig
65from .queries import (
66 QueryBuilder,
67 QuerySummary,
68)
69from .tables import makeRegistryTableSpecs
70from ._collectionType import CollectionType
71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch
74if TYPE_CHECKING:
75 from ..butlerConfig import ButlerConfig
76 from ..core import (
77 Quantum
78 )
79 from .interfaces import (
80 CollectionManager,
81 Database,
82 OpaqueTableStorageManager,
83 DimensionRecordStorageManager,
84 DatasetRecordStorageManager,
85 DatastoreRegistryBridgeManager,
86 )
89@dataclass
90class ConsistentDataIds:
91 """A struct used to report relationships between data IDs by
92 `Registry.relateDataIds`.
94 If an instance of this class is returned (instead of `None`), the data IDs
95 are "not inconsistent" - any keys they have in common have the same value,
96 and any spatial or temporal relationships they have at least might involve
97 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
98 to `True` in boolean contexts.
99 """
101 overlaps: bool
102 """If `True`, the data IDs have at least one key in common, associated with
103 the same value.
105 Note that data IDs are not inconsistent even if overlaps is `False` - they
106 may simply have no keys in common, which means they cannot have
107 inconsistent values for any keys. They may even be equal, in the case that
108 both data IDs are empty.
110 This field does _not_ indicate whether a spatial or temporal overlap
111 relationship exists.
112 """
114 contains: bool
115 """If `True`, all keys in the first data ID are in the second, and are
116 associated with the same values.
118 This includes the case where the first data ID is empty.
119 """
121 within: bool
122 """If `True`, all keys in the second data ID are in the first, and are
123 associated with the same values.
125 This includes the case where the second data ID is empty.
126 """
128 @property
129 def equal(self) -> bool:
130 """If `True`, the two data IDs are the same.
132 Data IDs are equal if they have both a `contains` and a `within`
133 relationship.
134 """
135 return self.contains and self.within
137 @property
138 def disjoint(self) -> bool:
139 """If `True`, the two data IDs have no keys in common.
141 This is simply the opposite of `overlaps`. Disjoint data IDs are by
142 definition not inconsistent.
143 """
144 return not self.overlaps
146 def __bool__(self) -> bool:
147 return True
150class Registry:
151 """Registry interface.
153 Parameters
154 ----------
155 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
156 Registry configuration
157 """
159 defaultConfigFile = None
160 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
161 absolute path. Can be `None` if no defaults are specified.
162 """
164 @classmethod
165 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
166 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
167 """Create `Registry` subclass instance from `config`.
169 Uses ``registry.cls`` from `config` to determine which subclass to
170 instantiate.
172 Parameters
173 ----------
174 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
175 Registry configuration
176 create : `bool`, optional
177 If `True`, assume an empty registry and create a new one.
178 butlerRoot : `str`, optional
179 Path to the repository root this `Registry` will manage.
180 writeable : `bool`, optional
181 If `True` (default) create a read-write connection to the database.
183 Returns
184 -------
185 registry : `Registry` (subclass)
186 A new `Registry` subclass instance.
187 """
188 if not isinstance(config, RegistryConfig):
189 if isinstance(config, str) or isinstance(config, Config):
190 config = RegistryConfig(config)
191 else:
192 raise ValueError("Incompatible Registry configuration: {}".format(config))
193 config.replaceRoot(butlerRoot)
194 DatabaseClass = config.getDatabaseClass()
195 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
196 namespace=config.get("namespace"), writeable=writeable)
197 universe = DimensionUniverse(config)
198 opaque = doImport(config["managers", "opaque"])
199 dimensions = doImport(config["managers", "dimensions"])
200 collections = doImport(config["managers", "collections"])
201 datasets = doImport(config["managers", "datasets"])
202 datastoreBridges = doImport(config["managers", "datastores"])
203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
204 datasets=datasets, datastoreBridges=datastoreBridges, create=create)
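# Example (illustrative sketch): constructing a registry from a repository
# configuration file.  The path shown here is hypothetical.
#
#     config = RegistryConfig("/path/to/repo/butler.yaml")
#     registry = Registry.fromConfig(config, create=True, writeable=True)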
206 def __init__(self, database: Database, universe: DimensionUniverse, *,
207 opaque: Type[OpaqueTableStorageManager],
208 dimensions: Type[DimensionRecordStorageManager],
209 collections: Type[CollectionManager],
210 datasets: Type[DatasetRecordStorageManager],
211 datastoreBridges: Type[DatastoreRegistryBridgeManager],
212 create: bool = False):
213 self._db = database
214 self.storageClasses = StorageClassFactory()
215 with self._db.declareStaticTables(create=create) as context:
216 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
217 self._collections = collections.initialize(self._db, context)
218 self._datasets = datasets.initialize(self._db, context,
219 collections=self._collections,
220 universe=self.dimensions)
221 self._opaque = opaque.initialize(self._db, context)
222 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
223 opaque=self._opaque,
224 datasets=datasets,
225 universe=self.dimensions)
226 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
227 self._collections,
228 self._datasets))
229 self._collections.refresh()
230 self._datasets.refresh(universe=self._dimensions.universe)
232 def __str__(self) -> str:
233 return str(self._db)
235 def __repr__(self) -> str:
236 return f"Registry({self._db!r}, {self.dimensions!r})"
238 def isWriteable(self) -> bool:
239 """Return `True` if this registry allows write operations, and `False`
240 otherwise.
241 """
242 return self._db.isWriteable()
244 @property
245 def dimensions(self) -> DimensionUniverse:
246 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
247 """
248 return self._dimensions.universe
250 @contextlib.contextmanager
251 def transaction(self):
252 """Return a context manager that represents a transaction.
253 """
254 # TODO make savepoint=False the default.
255 try:
256 with self._db.transaction():
257 yield
258 except BaseException:
259 # TODO: this clears the caches sometimes when we wouldn't actually
260 # need to. Can we avoid that?
261 self._dimensions.clearCaches()
262 raise
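# Example (illustrative sketch): grouping several inserts so they commit or
# roll back together.  The dimension records shown are hypothetical.
#
#     with registry.transaction():
#         registry.insertDimensionData("instrument", {"name": "DummyCam"})
#         registry.insertDimensionData(
#             "physical_filter",
#             {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "r"},
#         )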
264 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
265 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
266 other data repository client.
268 Opaque table records can be added via `insertOpaqueData`, retrieved via
269 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
271 Parameters
272 ----------
273 tableName : `str`
274 Logical name of the opaque table. This may differ from the
275 actual name used in the database by a prefix and/or suffix.
276 spec : `ddl.TableSpec`
277 Specification for the table to be added.
278 """
279 self._opaque.register(tableName, spec)
281 @transactional
282 def insertOpaqueData(self, tableName: str, *data: dict):
283 """Insert records into an opaque table.
285 Parameters
286 ----------
287 tableName : `str`
288 Logical name of the opaque table. Must match the name used in a
289 previous call to `registerOpaqueTable`.
290 data
291 Each additional positional argument is a dictionary that represents
292 a single row to be added.
293 """
294 self._opaque[tableName].insert(*data)
296 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
297 """Retrieve records from an opaque table.
299 Parameters
300 ----------
301 tableName : `str`
302 Logical name of the opaque table. Must match the name used in a
303 previous call to `registerOpaqueTable`.
304 where
305 Additional keyword arguments are interpreted as equality
306 constraints that restrict the returned rows (combined with AND);
307 keyword arguments are column names and values are the values they
308 must have.
310 Yields
311 ------
312 row : `dict`
313 A dictionary representing a single result row.
314 """
315 yield from self._opaque[tableName].fetch(**where)
317 @transactional
318 def deleteOpaqueData(self, tableName: str, **where: Any):
319 """Remove records from an opaque table.
321 Parameters
322 ----------
323 tableName : `str`
324 Logical name of the opaque table. Must match the name used in a
325 previous call to `registerOpaqueTable`.
326 where
327 Additional keyword arguments are interpreted as equality
328 constraints that restrict the deleted rows (combined with AND);
329 keyword arguments are column names and values are the values they
330 must have.
331 """
332 self._opaque[tableName].delete(**where)
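# Example (illustrative sketch): how a Datastore-like client might use the
# opaque-table methods.  The table name and row values are hypothetical, and
# the `ddl.TableSpec` field definitions are elided.
#
#     spec = ddl.TableSpec(fields=[...])
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records",
#                               {"dataset_id": 42, "path": "a/b.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=42))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=42)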
334 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
335 """Add a new collection if one with the given name does not exist.
337 Parameters
338 ----------
339 name : `str`
340 The name of the collection to create.
341 type : `CollectionType`
342 Enum value indicating the type of collection to create.
344 Notes
345 -----
346 This method cannot be called within transactions, as it needs to be
347 able to perform its own transaction to be concurrent.
348 """
349 self._collections.register(name, type)
351 def getCollectionType(self, name: str) -> CollectionType:
352 """Return an enumeration value indicating the type of the given
353 collection.
355 Parameters
356 ----------
357 name : `str`
358 The name of the collection.
360 Returns
361 -------
362 type : `CollectionType`
363 Enum value indicating the type of this collection.
365 Raises
366 ------
367 MissingCollectionError
368 Raised if no collection with the given name exists.
369 """
370 return self._collections.find(name).type
372 def registerRun(self, name: str):
373 """Add a new run if one with the given name does not exist.
375 Parameters
376 ----------
377 name : `str`
378 The name of the run to create.
380 Notes
381 -----
382 This method cannot be called within transactions, as it needs to be
383 able to perform its own transaction to be concurrent.
384 """
385 self._collections.register(name, CollectionType.RUN)
387 @transactional
388 def removeCollection(self, name: str):
389 """Completely remove the given collection.
391 Parameters
392 ----------
393 name : `str`
394 The name of the collection to remove.
396 Raises
397 ------
398 MissingCollectionError
399 Raised if no collection with the given name exists.
401 Notes
402 -----
403 If this is a `~CollectionType.RUN` collection, all datasets and quanta
404 in it are also fully removed. This requires that those datasets be
405 removed (or at least trashed) from any datastores that hold them first.
407 A collection may not be deleted as long as it is referenced by a
408 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
409 be deleted or redefined first.
410 """
411 self._collections.remove(name)
413 def getCollectionChain(self, parent: str) -> CollectionSearch:
414 """Return the child collections in a `~CollectionType.CHAINED`
415 collection.
417 Parameters
418 ----------
419 parent : `str`
420 Name of the chained collection. Must have already been added via
421 a call to `Registry.registerCollection`.
423 Returns
424 -------
425 children : `CollectionSearch`
426 An object that defines the search path of the collection.
427 See :ref:`daf_butler_collection_expressions` for more information.
429 Raises
430 ------
431 MissingCollectionError
432 Raised if ``parent`` does not exist in the `Registry`.
433 TypeError
434 Raised if ``parent`` does not correspond to a
435 `~CollectionType.CHAINED` collection.
436 """
437 record = self._collections.find(parent)
438 if record.type is not CollectionType.CHAINED:
439 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
440 return record.children
442 @transactional
443 def setCollectionChain(self, parent: str, children: Any):
444 """Define or redefine a `~CollectionType.CHAINED` collection.
446 Parameters
447 ----------
448 parent : `str`
449 Name of the chained collection. Must have already been added via
450 a call to `Registry.registerCollection`.
451 children : `Any`
452 An expression defining an ordered search of child collections,
453 generally an iterable of `str`. Restrictions on the dataset types
454 to be searched can also be included, by passing a mapping or an
455 iterable containing tuples; see
456 :ref:`daf_butler_collection_expressions` for more information.
458 Raises
459 ------
460 MissingCollectionError
461 Raised when any of the given collections do not exist in the
462 `Registry`.
463 TypeError
464 Raised if ``parent`` does not correspond to a
465 `~CollectionType.CHAINED` collection.
466 ValueError
467 Raised if the given collections contain a cycle.
468 """
469 record = self._collections.find(parent)
470 if record.type is not CollectionType.CHAINED:
471 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
472 children = CollectionSearch.fromExpression(children)
473 if children != record.children:
474 record.update(self._collections, children)
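# Example (illustrative sketch): defining a CHAINED collection that searches a
# run and a tagged collection in order.  Collection names are hypothetical.
#
#     registry.registerRun("runs/nightly")
#     registry.registerCollection("shared/best", CollectionType.TAGGED)
#     registry.registerCollection("shared/chain", CollectionType.CHAINED)
#     registry.setCollectionChain("shared/chain", ["runs/nightly", "shared/best"])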
476 def registerDatasetType(self, datasetType: DatasetType) -> bool:
477 """
478 Add a new `DatasetType` to the Registry.
480 It is not an error to register the same `DatasetType` twice.
482 Parameters
483 ----------
484 datasetType : `DatasetType`
485 The `DatasetType` to be added.
487 Returns
488 -------
489 inserted : `bool`
490 `True` if ``datasetType`` was inserted, `False` if an identical
491 existing `DatasetType` was found. Note that in either case the
492 DatasetType is guaranteed to be defined in the Registry
493 consistently with the given definition.
495 Raises
496 ------
497 ValueError
498 Raised if the dimensions or storage class are invalid.
499 ConflictingDefinitionError
500 Raised if this DatasetType is already registered with a different
501 definition.
503 Notes
504 -----
505 This method cannot be called within transactions, as it needs to be
506 able to perform its own transaction to be concurrent.
507 """
508 _, inserted = self._datasets.register(datasetType)
509 return inserted
511 def getDatasetType(self, name: str) -> DatasetType:
512 """Get the `DatasetType`.
514 Parameters
515 ----------
516 name : `str`
517 Name of the type.
519 Returns
520 -------
521 type : `DatasetType`
522 The `DatasetType` associated with the given name.
524 Raises
525 ------
526 KeyError
527 Requested named DatasetType could not be found in registry.
528 """
529 storage = self._datasets.find(name)
530 if storage is None:
531 raise KeyError(f"DatasetType '{name}' could not be found.")
532 return storage.datasetType
534 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
535 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
536 """Find a dataset given its `DatasetType` and data ID.
538 This can be used to obtain a `DatasetRef` that permits the dataset to
539 be read from a `Datastore`. If the dataset is a component and can not
540 be found using the provided dataset type, a dataset ref for the parent
541 will be returned instead but with the correct dataset type.
543 Parameters
544 ----------
545 datasetType : `DatasetType` or `str`
546 A `DatasetType` or the name of one.
547 dataId : `dict` or `DataCoordinate`, optional
548 A `dict`-like object containing the `Dimension` links that identify
549 the dataset within a collection.
550 collections
551 An expression that fully or partially identifies the collections
552 to search for the dataset, such as a `str`, `re.Pattern`, or
553 iterable thereof. `...` can be used to return all collections.
554 See :ref:`daf_butler_collection_expressions` for more information.
555 **kwargs
556 Additional keyword arguments passed to
557 `DataCoordinate.standardize` to convert ``dataId`` to a true
558 `DataCoordinate` or augment an existing one.
560 Returns
561 -------
562 ref : `DatasetRef`
563 A reference to the dataset, or `None` if no matching Dataset
564 was found.
566 Raises
567 ------
568 LookupError
569 Raised if one or more data ID keys are missing or the dataset type
570 does not exist.
571 MissingCollectionError
572 Raised if any of ``collections`` does not exist in the registry.
573 """
574 if isinstance(datasetType, DatasetType):
575 storage = self._datasets.find(datasetType.name)
576 if storage is None:
577 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
578 else:
579 storage = self._datasets.find(datasetType)
580 if storage is None:
581 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
582 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
583 universe=self.dimensions, **kwargs)
584 collections = CollectionSearch.fromExpression(collections)
585 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
586 result = storage.find(collectionRecord, dataId)
587 if result is not None:
588 if result.datasetType.isComposite():
589 result = self._datasets.fetchComponents(result)
590 return result
592 # fallback to the parent if we got nothing and this was a component
593 if storage.datasetType.isComponent():
594 parentType, _ = storage.datasetType.nameAndComponent()
595 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
596 if parentRef is not None:
597 # Should already conform and we know no components
598 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
599 run=parentRef.run, conform=False, hasParentId=True)
601 return None
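# Example (illustrative sketch): looking up a single dataset by dataset type
# and data ID.  The dataset type, dimension values, and collection name are
# hypothetical.
#
#     ref = registry.findDataset("calexp", instrument="DummyCam", visit=42,
#                                detector=1, collections=["shared/chain"])
#     if ref is not None:
#         print(ref.id, ref.run)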
603 @transactional
604 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
605 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
606 ) -> List[DatasetRef]:
607 """Insert one or more datasets into the `Registry`
609 This always adds new datasets; to associate existing datasets with
610 a new collection, use ``associate``.
612 Parameters
613 ----------
614 datasetType : `DatasetType` or `str`
615 A `DatasetType` or the name of one.
616 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
617 Dimension-based identifiers for the new datasets.
618 run : `str`
619 The name of the run that produced the datasets.
620 producer : `Quantum`
621 Unit of work that produced the datasets. May be `None` to store
622 no provenance information, but if present the `Quantum` must
623 already have been added to the Registry.
624 recursive : `bool`
625 If `True`, recursively add datasets and attach entries for component
626 datasets as well.
628 Returns
629 -------
630 refs : `list` of `DatasetRef`
631 Resolved `DatasetRef` instances for all given data IDs (in the same
632 order).
634 Raises
635 ------
636 ConflictingDefinitionError
637 If a dataset with the same dataset type and data ID as one of those
638 given already exists in ``run``.
639 MissingCollectionError
640 Raised if ``run`` does not exist in the registry.
641 """
642 if isinstance(datasetType, DatasetType):
643 storage = self._datasets.find(datasetType.name)
644 if storage is None:
645 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
646 else:
647 storage = self._datasets.find(datasetType)
648 if storage is None:
649 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
650 runRecord = self._collections.find(run)
651 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds]
652 try:
653 refs = list(storage.insert(runRecord, dataIds, quantum=producer))
654 except sqlalchemy.exc.IntegrityError as err:
655 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
656 f"one or more datasets of type {storage.datasetType} into "
657 f"collection '{run}'. "
658 f"This probably means a dataset with the same data ID "
659 f"and dataset type already exists, but it may also mean a "
660 f"dimension row is missing.") from err
661 if recursive and storage.datasetType.isComposite():
662 # Insert component rows by recursing.
663 composites = defaultdict(dict)
664 # TODO: we really shouldn't be inserting all components defined by
665 # the storage class, because there's no guarantee all of them are
666 # actually present in these datasets.
667 for componentName in storage.datasetType.storageClass.components:
668 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName)
669 componentRefs = self.insertDatasets(componentDatasetType,
670 dataIds=dataIds,
671 run=run,
672 producer=producer,
673 recursive=True)
674 for parentRef, componentRef in zip(refs, componentRefs):
675 composites[parentRef][componentName] = componentRef
676 if composites:
677 refs = list(self._datasets.attachComponents(composites.items()))
678 return refs
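# Example (illustrative sketch): registering new datasets in a RUN collection.
# Assumes the hypothetical "calexp" dataset type and the referenced dimension
# records have already been registered.
#
#     registry.registerRun("runs/nightly")
#     (ref,) = registry.insertDatasets(
#         "calexp",
#         dataIds=[{"instrument": "DummyCam", "visit": 42, "detector": 1}],
#         run="runs/nightly",
#     )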
680 def getDataset(self, id: int) -> Optional[DatasetRef]:
681 """Retrieve a Dataset entry.
683 Parameters
684 ----------
685 id : `int`
686 The unique identifier for the dataset.
688 Returns
689 -------
690 ref : `DatasetRef` or `None`
691 A ref to the Dataset, or `None` if no matching Dataset
692 was found.
693 """
694 ref = self._datasets.getDatasetRef(id)
695 if ref is None:
696 return None
697 if ref.datasetType.isComposite():
698 return self._datasets.fetchComponents(ref)
699 return ref
701 @transactional
702 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True):
703 """Remove datasets from the Registry.
705 The datasets will be removed unconditionally from all collections, and
706 any `Quantum` that consumed these datasets will instead be marked as
707 having a NULL input. `Datastore` records will *not* be deleted; the
708 caller is responsible for ensuring that the dataset has already been
709 removed from all Datastores.
711 Parameters
712 ----------
713 refs : `Iterable` of `DatasetRef`
714 References to the datasets to be removed. Must include a valid
715 ``id`` attribute, and should be considered invalidated upon return.
716 recursive : `bool`, optional
717 If `True`, remove all component datasets as well. Note that
718 this only removes components that are actually included in the
719 given `DatasetRef` instances, which may not be the same as those in
720 the database (especially if they were obtained from
721 `queryDatasets`, which does not populate `DatasetRef.components`).
723 Raises
724 ------
725 AmbiguousDatasetError
726 Raised if any ``ref.id`` is `None`.
727 OrphanedRecordError
728 Raised if any dataset is still present in any `Datastore`.
729 """
730 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
731 storage = self._datasets.find(datasetType.name)
732 try:
733 storage.delete(refsForType)
734 except sqlalchemy.exc.IntegrityError as err:
735 raise OrphanedRecordError("One or more datasets is still "
736 "present in one or more Datastores.") from err
738 @transactional
739 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]):
740 """Attach components to a dataset.
742 Parameters
743 ----------
744 parent : `DatasetRef`
745 A reference to the parent dataset.
746 components : `Mapping` [ `str`, `DatasetRef` ]
747 Mapping from component name to the `DatasetRef` for that component.
749 Returns
750 -------
751 ref : `DatasetRef`
752 An updated version of ``parent`` with components included.
754 Raises
755 ------
756 AmbiguousDatasetError
757 Raised if ``parent.id`` or any `DatasetRef.id` in ``components``
758 is `None`.
759 """
760 for name, ref in components.items():
761 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]:
762 raise TypeError(f"Expected storage class "
763 f"'{parent.datasetType.storageClass.components[name].name}' "
764 f"for component '{name}' of dataset {parent}; got "
765 f"dataset {ref} with storage class "
766 f"'{ref.datasetType.storageClass.name}'.")
767 ref, = self._datasets.attachComponents([(parent, components)])
768 return ref
770 @transactional
771 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
772 """Add existing datasets to a `~CollectionType.TAGGED` collection.
774 If a DatasetRef with the same exact integer ID is already in a
775 collection, nothing is changed. If a `DatasetRef` with the same
776 `DatasetType` and data ID but with different integer ID
777 exists in the collection, `ConflictingDefinitionError` is raised.
779 Parameters
780 ----------
781 collection : `str`
782 Indicates the collection the datasets should be associated with.
783 refs : `Iterable` [ `DatasetRef` ]
784 An iterable of resolved `DatasetRef` instances that already exist
785 in this `Registry`.
786 recursive : `bool`, optional
787 If `True`, associate all component datasets as well. Note that
788 this only associates components that are actually included in the
789 given `DatasetRef` instances, which may not be the same as those in
790 the database (especially if they were obtained from
791 `queryDatasets`, which does not populate `DatasetRef.components`).
793 Raises
794 ------
795 ConflictingDefinitionError
796 If a Dataset with the given `DatasetRef` already exists in the
797 given collection.
798 AmbiguousDatasetError
799 Raised if ``any(ref.id is None for ref in refs)``.
800 MissingCollectionError
801 Raised if ``collection`` does not exist in the registry.
802 TypeError
803 Raised if adding new datasets to the given ``collection`` is not
804 allowed.
805 """
806 collectionRecord = self._collections.find(collection)
807 if collectionRecord.type is not CollectionType.TAGGED:
808 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
809 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
810 storage = self._datasets.find(datasetType.name)
811 try:
812 storage.associate(collectionRecord, refsForType)
813 except sqlalchemy.exc.IntegrityError as err:
814 raise ConflictingDefinitionError(
815 f"Constraint violation while associating dataset of type {datasetType.name} with "
816 f"collection {collection}. This probably means that one or more datasets with the same "
817 f"dataset type and data ID already exist in the collection, but it may also indicate "
818 f"that the datasets do not exist."
819 ) from err
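# Example (illustrative sketch): tagging existing datasets so they can be found
# through a curated TAGGED collection.  Names are hypothetical.
#
#     registry.registerCollection("shared/best", CollectionType.TAGGED)
#     refs = registry.queryDatasets("calexp", collections=["runs/nightly"])
#     registry.associate("shared/best", refs)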
821 @transactional
822 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
823 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
825 ``collection`` and ``ref`` combinations that are not currently
826 associated are silently ignored.
828 Parameters
829 ----------
830 collection : `str`
831 The collection the datasets should no longer be associated with.
832 refs : `Iterable` [ `DatasetRef` ]
833 An iterable of resolved `DatasetRef` instances that already exist
834 in this `Registry`.
835 recursive : `bool`, optional
836 If `True`, disassociate all component datasets as well. Note that
837 this only disassociates components that are actually included in
838 the given `DatasetRef` instances, which may not be the same as
839 those in the database (especially if they were obtained from
840 `queryDatasets`, which does not populate `DatasetRef.components`).
842 Raises
843 ------
844 AmbiguousDatasetError
845 Raised if any of the given dataset references is unresolved.
846 MissingCollectionError
847 Raised if ``collection`` does not exist in the registry.
848 TypeError
849 Raised if removing datasets from the given ``collection`` is not
850 allowed.
851 """
852 collectionRecord = self._collections.find(collection)
853 if collectionRecord.type is not CollectionType.TAGGED:
854 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
855 "expected TAGGED.")
856 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
857 storage = self._datasets.find(datasetType.name)
858 storage.disassociate(collectionRecord, refsForType)
860 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
861 """Return the manager that mediates between this `Registry` and its Datastores (`DatastoreRegistryBridgeManager`)."""
862 return self._datastoreBridges
864 def getDatasetLocations(self, ref: DatasetRef) -> Iterator[str]:
865 """Retrieve datastore locations for a given dataset.
867 Typically used by `Datastore`.
869 Parameters
870 ----------
871 ref : `DatasetRef`
872 A reference to the dataset for which to retrieve storage
873 information.
875 Returns
876 -------
877 datastores : `Iterable` [ `str` ]
878 All the matching datastores holding this dataset.
880 Raises
881 ------
882 AmbiguousDatasetError
883 Raised if ``ref.id`` is `None`.
884 """
885 return self._datastoreBridges.findDatastores(ref)
887 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
888 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
889 """Expand a dimension-based data ID to include additional information.
891 Parameters
892 ----------
893 dataId : `DataCoordinate` or `dict`, optional
894 Data ID to be expanded; augmented and overridden by ``kwds``.
895 graph : `DimensionGraph`, optional
896 Set of dimensions for the expanded ID. If `None`, the dimensions
897 will be inferred from the keys of ``dataId`` and ``kwds``.
898 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
899 are silently ignored, providing a way to extract and expand a
900 subset of a data ID.
901 records : mapping [`DimensionElement`, `DimensionRecord`], optional
902 Dimension record data to use before querying the database for that
903 data.
904 **kwds
905 Additional keywords are treated like additional key-value pairs for
906 ``dataId``, extending and overriding it.
908 Returns
909 -------
910 expanded : `ExpandedDataCoordinate`
911 A data ID that includes full metadata for all of the dimensions it
912 identifies.
913 """
914 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
915 if isinstance(standardized, ExpandedDataCoordinate):
916 return standardized
917 elif isinstance(dataId, ExpandedDataCoordinate):
918 records = dict(records) if records is not None else {}
919 records.update(dataId.records)
920 else:
921 records = dict(records) if records is not None else {}
922 keys = dict(standardized)
923 regions = []
924 timespans = []
925 for element in standardized.graph.primaryKeyTraversalOrder:
926 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
927 if record is ...:
928 storage = self._dimensions[element]
929 record = storage.fetch(keys)
930 records[element] = record
931 if record is not None:
932 for d in element.implied:
933 value = getattr(record, d.name)
934 if keys.setdefault(d, value) != value:
935 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
936 f"but {element.name} implies {d.name}={value!r}.")
937 if element in standardized.graph.spatial and record.region is not None:
938 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
939 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
940 f"is disjoint with those for other elements.")
941 regions.append(record.region)
942 if element in standardized.graph.temporal:
943 if any(not record.timespan.overlaps(t) for t in timespans):
944 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
945 f" is disjoint with those for other elements.")
946 timespans.append(record.timespan)
947 else:
948 if element in standardized.graph.required:
949 raise LookupError(
950 f"Could not fetch record for required dimension {element.name} via keys {keys}."
951 )
952 if element.alwaysJoin:
953 raise InconsistentDataIdError(
954 f"Could not fetch record for element {element.name} via keys {keys}, ",
955 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
956 "related."
957 )
958 records.update((d, None) for d in element.implied)
959 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
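# Example (illustrative sketch): expanding a minimal data ID into an
# `ExpandedDataCoordinate` carrying the matching dimension records.  The
# dimension values are hypothetical.
#
#     dataId = registry.expandDataId(instrument="DummyCam", exposure=42)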
961 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
962 """Compare the keys and values of a pair of data IDs for consistency.
964 See `ConsistentDataIds` for more information.
966 Parameters
967 ----------
968 a : `dict` or `DataCoordinate`
969 First data ID to be compared.
970 b : `dict` or `DataCoordinate`
971 Second data ID to be compared.
973 Returns
974 -------
975 relationship : `ConsistentDataIds` or `None`
976 Relationship information. This is not `None` and coerces to
977 `True` in boolean contexts if and only if the data IDs are
978 consistent in terms of all common key-value pairs, all many-to-many
979 join tables, and all spatial and temporal relationships.
980 """
981 a = DataCoordinate.standardize(a, universe=self.dimensions)
982 b = DataCoordinate.standardize(b, universe=self.dimensions)
983 aFull = getattr(a, "full", None)
984 bFull = getattr(b, "full", None)
985 aBest = aFull if aFull is not None else a
986 bBest = bFull if bFull is not None else b
987 jointKeys = aBest.keys() & bBest.keys()
988 # If any common values are not equal, we know they are inconsistent.
989 if any(aBest[k] != bBest[k] for k in jointKeys):
990 return None
991 # If the graphs are equal, we know the data IDs are.
992 if a.graph == b.graph:
993 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
994 # Result is still inconclusive. Try to expand a data ID containing
995 # keys from both; that will fail if they are inconsistent.
996 # First, if either input was already an ExpandedDataCoordinate, extract
997 # its records so we don't have to query for them.
998 records = {}
999 if hasattr(a, "records"):
1000 records.update(a.records)
1001 if hasattr(b, "records"):
1002 records.update(b.records)
1003 try:
1004 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1005 except InconsistentDataIdError:
1006 return None
1007 # We know the answer is not `None`; time to figure out what it is.
1008 return ConsistentDataIds(
1009 contains=(a.graph >= b.graph),
1010 within=(a.graph <= b.graph),
1011 overlaps=bool(a.graph & b.graph),
1012 )
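# Example (illustrative sketch): testing two data IDs for consistency.  The
# dimension values are hypothetical.
#
#     rel = registry.relateDataIds({"instrument": "DummyCam", "visit": 42},
#                                  {"instrument": "DummyCam"})
#     if rel and rel.contains:
#         pass  # the first data ID includes and agrees with the second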
1014 def insertDimensionData(self, element: Union[DimensionElement, str],
1015 *data: Union[dict, DimensionRecord],
1016 conform: bool = True):
1017 """Insert one or more dimension records into the database.
1019 Parameters
1020 ----------
1021 element : `DimensionElement` or `str`
1022 The `DimensionElement` or name thereof that identifies the table
1023 records will be inserted into.
1024 data : `dict` or `DimensionRecord` (variadic)
1025 One or more records to insert.
1026 conform : `bool`, optional
1027 If `False` (`True` is default) perform no checking or conversions,
1028 and assume that ``element`` is a `DimensionElement` instance and
1029 ``data`` is one or more `DimensionRecord` instances of the
1030 appropriate subclass.
1031 """
1032 if conform:
1033 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1034 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1035 for row in data]
1036 else:
1037 records = data
1038 storage = self._dimensions[element]
1039 storage.insert(*records)
1041 def syncDimensionData(self, element: Union[DimensionElement, str],
1042 row: Union[dict, DimensionRecord],
1043 conform: bool = True) -> bool:
1044 """Synchronize the given dimension record with the database, inserting
1045 if it does not already exist and comparing values if it does.
1047 Parameters
1048 ----------
1049 element : `DimensionElement` or `str`
1050 The `DimensionElement` or name thereof that identifies the table
1051 records will be inserted into.
1052 row : `dict` or `DimensionRecord`
1053 The record to insert.
1054 conform : `bool`, optional
1055 If `False` (`True` is default) perform no checking or conversions,
1056 and assume that ``element`` is a `DimensionElement` instance and
1057 ``row`` is a `DimensionRecord` instance of the
1058 appropriate subclass.
1060 Returns
1061 -------
1062 inserted : `bool`
1063 `True` if a new row was inserted, `False` otherwise.
1065 Raises
1066 ------
1067 ConflictingDefinitionError
1068 Raised if the record exists in the database (according to primary
1069 key lookup) but is inconsistent with the given one.
1071 Notes
1072 -----
1073 This method cannot be called within transactions, as it needs to be
1074 able to perform its own transaction to be concurrent.
1075 """
1076 if conform:
1077 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1078 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1079 else:
1080 record = row
1081 storage = self._dimensions[element]
1082 return storage.sync(record)
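# Example (illustrative sketch): idempotently ensuring a dimension record
# exists, e.g. during repository setup.  The record values are hypothetical.
#
#     inserted = registry.syncDimensionData("instrument", {"name": "DummyCam"})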
1084 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1085 """Iterate over the dataset types whose names match an expression.
1087 Parameters
1088 ----------
1089 expression : `Any`, optional
1090 An expression that fully or partially identifies the dataset types
1091 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1092 `...` can be used to return all dataset types, and is the default.
1093 See :ref:`daf_butler_dataset_type_expressions` for more
1094 information.
1096 Yields
1097 ------
1098 datasetType : `DatasetType`
1099 A `DatasetType` instance whose name matches ``expression``.
1100 """
1101 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1102 if wildcard is ...:
1103 yield from self._datasets
1104 return
1105 done = set()
1106 for name in wildcard.strings:
1107 storage = self._datasets.find(name)
1108 if storage is not None:
1109 done.add(storage.datasetType)
1110 yield storage.datasetType
1111 if wildcard.patterns:
1112 for datasetType in self._datasets:
1113 if datasetType.name in done:
1114 continue
1115 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1116 yield datasetType
1118 def queryCollections(self, expression: Any = ...,
1119 datasetType: Optional[DatasetType] = None,
1120 collectionType: Optional[CollectionType] = None,
1121 flattenChains: bool = False,
1122 includeChains: Optional[bool] = None) -> Iterator[str]:
1123 """Iterate over the collections whose names match an expression.
1125 Parameters
1126 ----------
1127 expression : `Any`, optional
1128 An expression that fully or partially identifies the collections
1129 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1130 `...` can be used to return all collections, and is the default.
1131 See :ref:`daf_butler_collection_expressions` for more
1132 information.
1133 datasetType : `DatasetType`, optional
1134 If provided, only yield collections that should be searched for
1135 this dataset type according to ``expression``. If this is
1136 not provided, any dataset type restrictions in ``expression`` are
1137 ignored.
1138 collectionType : `CollectionType`, optional
1139 If provided, only yield collections of this type.
1140 flattenChains : `bool`, optional
1141 If `True` (`False` is default), recursively yield the child
1142 collections of matching `~CollectionType.CHAINED` collections.
1143 includeChains : `bool`, optional
1144 If `True`, yield records for matching `~CollectionType.CHAINED`
1145 collections. Default is the opposite of ``flattenChains``: include
1146 either CHAINED collections or their children, but not both.
1148 Yields
1149 ------
1150 collection : `str`
1151 The name of a collection that matches ``expression``.
1152 """
1153 query = CollectionQuery.fromExpression(expression)
1154 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1155 flattenChains=flattenChains, includeChains=includeChains):
1156 yield record.name
1158 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1159 """Return a `QueryBuilder` instance capable of constructing and
1160 managing more complex queries than those obtainable via `Registry`
1161 interfaces.
1163 This is an advanced interface; downstream code should prefer
1164 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1165 are sufficient.
1167 Parameters
1168 ----------
1169 summary : `QuerySummary`
1170 Object describing and categorizing the full set of dimensions that
1171 will be included in the query.
1173 Returns
1174 -------
1175 builder : `QueryBuilder`
1176 Object that can be used to construct and perform advanced queries.
1177 """
1178 return QueryBuilder(summary=summary,
1179 collections=self._collections,
1180 dimensions=self._dimensions,
1181 datasets=self._datasets)
1183 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1184 dataId: Optional[DataId] = None,
1185 datasets: Any = None,
1186 collections: Any = None,
1187 where: Optional[str] = None,
1188 expand: bool = True,
1189 **kwds) -> Iterator[DataCoordinate]:
1190 """Query for and iterate over data IDs matching user-provided criteria.
1192 Parameters
1193 ----------
1194 dimensions : `Dimension` or `str`, or iterable thereof
1195 The dimensions of the data IDs to yield, as either `Dimension`
1196 instances or `str`. Will be automatically expanded to a complete
1197 `DimensionGraph`.
1198 dataId : `dict` or `DataCoordinate`, optional
1199 A data ID whose key-value pairs are used as equality constraints
1200 in the query.
1201 datasets : `Any`, optional
1202 An expression that fully or partially identifies dataset types
1203 that should constrain the yielded data IDs. For example, including
1204 "raw" here would constrain the yielded ``instrument``,
1205 ``exposure``, ``detector``, and ``physical_filter`` values to only
1206 those for which at least one "raw" dataset exists in
1207 ``collections``. Allowed types include `DatasetType`, `str`,
1208 `re.Pattern`, and iterables thereof. Unlike other dataset type
1209 expressions, `...` is not permitted - it doesn't make sense to
1210 constrain data IDs on the existence of *all* datasets.
1211 See :ref:`daf_butler_dataset_type_expressions` for more
1212 information.
1213 collections : `Any`, optional
1214 An expression that fully or partially identifies the collections
1215 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1216 thereof. `...` can be used to return all collections. Must be
1217 provided if ``datasets`` is, and is ignored if it is not. See
1218 :ref:`daf_butler_collection_expressions` for more information.
1219 where : `str`, optional
1220 A string expression similar to a SQL WHERE clause. May involve
1221 any column of a dimension table or (as a shortcut for the primary
1222 key column of a dimension table) dimension name. See
1223 :ref:`daf_butler_dimension_expressions` for more information.
1224 expand : `bool`, optional
1225 If `True` (default) yield `ExpandedDataCoordinate` instead of
1226 minimal `DataCoordinate` base-class instances.
1227 kwds
1228 Additional keyword arguments are forwarded to
1229 `DataCoordinate.standardize` when processing the ``dataId``
1230 argument (and may be used to provide a constraining data ID even
1231 when the ``dataId`` argument is `None`).
1233 Yields
1234 ------
1235 dataId : `DataCoordinate`
1236 Data IDs matching the given query parameters. Order is
1237 unspecified.
1238 """
1239 dimensions = iterable(dimensions)
1240 standardizedDataId = self.expandDataId(dataId, **kwds)
1241 standardizedDatasetTypes = []
1242 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1243 if datasets is not None:
1244 if collections is None:
1245 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1246 for datasetType in self.queryDatasetTypes(datasets):
1247 requestedDimensionNames.update(datasetType.dimensions.names)
1248 standardizedDatasetTypes.append(datasetType)
1249 # Preprocess collections expression in case the original included
1250 # single-pass iterators (we'll want to use it multiple times
1251 # below).
1252 collections = CollectionQuery.fromExpression(collections)
1254 summary = QuerySummary(
1255 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1256 dataId=standardizedDataId,
1257 expression=where,
1258 )
1259 builder = self.makeQueryBuilder(summary)
1260 for datasetType in standardizedDatasetTypes:
1261 builder.joinDataset(datasetType, collections, isResult=False)
1262 query = builder.finish()
1263 predicate = query.predicate()
1264 for row in self._db.query(query.sql):
1265 if predicate(row):
1266 result = query.extractDataId(row)
1267 if expand:
1268 yield self.expandDataId(result, records=standardizedDataId.records)
1269 else:
1270 yield result
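# Example (illustrative sketch): finding all visit/detector combinations for
# which a "raw" dataset exists in a collection.  The names and the ``where``
# string are hypothetical.
#
#     dataIds = registry.queryDimensions(
#         ["visit", "detector"],
#         datasets="raw",
#         collections=["runs/nightly"],
#         where="instrument = 'DummyCam'",
#     )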
1272 def queryDatasets(self, datasetType: Any, *,
1273 collections: Any,
1274 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1275 dataId: Optional[DataId] = None,
1276 where: Optional[str] = None,
1277 deduplicate: bool = False,
1278 expand: bool = True,
1279 **kwds) -> Iterator[DatasetRef]:
1280 """Query for and iterate over dataset references matching user-provided
1281 criteria.
1283 Parameters
1284 ----------
1285 datasetType
1286 An expression that fully or partially identifies the dataset types
1287 to be queried. Allowed types include `DatasetType`, `str`,
1288 `re.Pattern`, and iterables thereof. The special value `...` can
1289 be used to query all dataset types. See
1290 :ref:`daf_butler_dataset_type_expressions` for more information.
1291 collections
1292 An expression that fully or partially identifies the collections
1293 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1294 thereof. `...` can be used to return all collections. See
1295 :ref:`daf_butler_collection_expressions` for more information.
1296 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1297 Dimensions to include in the query (in addition to those used
1298 to identify the queried dataset type(s)), either to constrain
1299 the resulting datasets to those for which a matching dimension
1300 exists, or to relate the dataset type's dimensions to dimensions
1301 referenced by the ``dataId`` or ``where`` arguments.
1302 dataId : `dict` or `DataCoordinate`, optional
1303 A data ID whose key-value pairs are used as equality constraints
1304 in the query.
1305 where : `str`, optional
1306 A string expression similar to a SQL WHERE clause. May involve
1307 any column of a dimension table or (as a shortcut for the primary
1308 key column of a dimension table) dimension name. See
1309 :ref:`daf_butler_dimension_expressions` for more information.
1310 deduplicate : `bool`, optional
1311 If `True` (`False` is default), for each result data ID, only
1312 yield one `DatasetRef` of each `DatasetType`, from the first
1313 collection in which a dataset of that dataset type appears
1314 (according to the order of ``collections`` passed in). If `True`,
1315 ``collections`` must not contain regular expressions and may not
1316 be `...`.
1317 expand : `bool`, optional
1318 If `True` (default) attach `ExpandedDataCoordinate` instead of
1319 minimal `DataCoordinate` base-class instances.
1320 kwds
1321 Additional keyword arguments are forwarded to
1322 `DataCoordinate.standardize` when processing the ``dataId``
1323 argument (and may be used to provide a constraining data ID even
1324 when the ``dataId`` argument is `None`).
1326 Yields
1327 ------
1328 ref : `DatasetRef`
1329 Dataset references matching the given query criteria. These
1330 are grouped by `DatasetType` if the query evaluates to multiple
1331 dataset types, but order is otherwise unspecified.
1333 Raises
1334 ------
1335 TypeError
1336 Raised when the arguments are incompatible, such as when a
1337 collection wildcard is passed when ``deduplicate`` is `True`.
1339 Notes
1340 -----
1341 When multiple dataset types are queried in a single call, the
1342 results of this operation are equivalent to querying for each dataset
1343 type separately in turn, and no information about the relationships
1344 between datasets of different types is included. In contexts where
1345 that kind of information is important, the recommended pattern is to
1346 use `queryDimensions` to first obtain data IDs (possibly with the
1347 desired dataset types and collections passed as constraints to the
1348 query), and then use multiple (generally much simpler) calls to
1349 `queryDatasets` with the returned data IDs passed as constraints.
1350 """
1351 # Standardize the collections expression.
1352 if deduplicate:
1353 collections = CollectionSearch.fromExpression(collections)
1354 else:
1355 collections = CollectionQuery.fromExpression(collections)
1356 # Standardize and expand the data ID provided as a constraint.
1357 standardizedDataId = self.expandDataId(dataId, **kwds)
1358 # If the datasetType passed isn't actually a DatasetType, expand it
1359 # (it could be an expression that yields multiple DatasetTypes) and
1360 # recurse.
1361 if not isinstance(datasetType, DatasetType):
1362 for trueDatasetType in self.queryDatasetTypes(datasetType):
1363 yield from self.queryDatasets(trueDatasetType, collections=collections,
1364 dimensions=dimensions, dataId=standardizedDataId,
1365 where=where, deduplicate=deduplicate, expand=expand)
1366 return
1367 # The full set of dimensions in the query is the combination of those
1368 # needed for the DatasetType and those explicitly requested, if any.
1369 requestedDimensionNames = set(datasetType.dimensions.names)
1370 if dimensions is not None:
1371 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1372 # Construct the summary structure needed to construct a QueryBuilder.
1373 summary = QuerySummary(
1374 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1375 dataId=standardizedDataId,
1376 expression=where,
1377 )
1378 builder = self.makeQueryBuilder(summary)
1379 # Add the dataset subquery to the query, telling the QueryBuilder to
1380 # include the rank of the selected collection in the results only if we
1381 # need to deduplicate. Note that if any of the collections are
1382 # actually wildcard expressions, and we've asked for deduplication,
1383 # this will raise TypeError for us.
1384 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1385 return
1386 query = builder.finish()
1387 predicate = query.predicate()
1388 if not deduplicate:
1389 # No need to de-duplicate across collections.
1390 for row in self._db.query(query.sql):
1391 if predicate(row):
1392 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1393 if expand:
1394 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1395 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1396 else:
1397 # For each data ID, yield only the DatasetRef with the lowest
1398 # collection rank.
1399 bestRefs = {}
1400 bestRanks = {}
1401 for row in self._db.query(query.sql):
1402 if predicate(row):
1403 ref, rank = query.extractDatasetRef(row, datasetType)
1404 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1405 if rank < bestRank:
1406 bestRefs[ref.dataId] = ref
1407 bestRanks[ref.dataId] = rank
1408 # If caller requested expanded data IDs, we defer that until here
1409 # so we do as little expansion as possible.
1410 if expand:
1411 for ref in bestRefs.values():
1412 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1413 yield ref.expanded(dataId)
1414 else:
1415 yield from bestRefs.values()
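# Example (illustrative sketch): iterating over "calexp" datasets across two
# collections, keeping only the first match per data ID.  Names are
# hypothetical.
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["runs/nightly", "runs/weekly"],
#         deduplicate=True,
#     )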
1417 dimensions: DimensionUniverse
1418 """The universe of all dimensions known to the registry
1419 (`DimensionUniverse`).
1420 """
1422 storageClasses: StorageClassFactory
1423 """All storage classes known to the registry (`StorageClassFactory`).
1424 """