Coverage for python/lsst/daf/butler/registry/_registry.py : 11%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31import sys
32from typing import (
33 Any,
34 Dict,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Type,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48from ..core import (
49 Config,
50 DataCoordinate,
51 DataCoordinateIterable,
52 DataId,
53 DatasetRef,
54 DatasetType,
55 ddl,
56 Dimension,
57 DimensionElement,
58 DimensionGraph,
59 DimensionRecord,
60 DimensionUniverse,
61 NamedKeyMapping,
62 NameLookupMapping,
63 StorageClassFactory,
64)
65from ..core.utils import doImport, iterable, transactional
66from ._config import RegistryConfig
67from .queries import (
68 QueryBuilder,
69 QuerySummary,
70)
71from ._collectionType import CollectionType
72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
73from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
74from .interfaces import ChainedCollectionRecord, RunRecord
75from .versions import ButlerVersionsManager, DigestMismatchError
77if TYPE_CHECKING:  # coverage: 77 ↛ 78 (line 77 didn't jump to line 78 because the condition was never true)
78 from ..butlerConfig import ButlerConfig
79 from .interfaces import (
80 ButlerAttributeManager,
81 CollectionManager,
82 Database,
83 OpaqueTableStorageManager,
84 DimensionRecordStorageManager,
85 DatasetRecordStorageManager,
86 DatastoreRegistryBridgeManager,
87 )
90_LOG = logging.getLogger(__name__)
93class Registry:
94 """Registry interface.
96 Parameters
97 ----------
98 database : `Database`
99 Database instance to store Registry.
100 universe : `DimensionUniverse`
101 Full set of dimensions for Registry.
102 attributes : `type`
103 Manager class implementing `ButlerAttributeManager`.
104 opaque : `type`
105 Manager class implementing `OpaqueTableStorageManager`.
106 dimensions : `type`
107 Manager class implementing `DimensionRecordStorageManager`.
108 collections : `type`
109 Manager class implementing `CollectionManager`.
110 datasets : `type`
111 Manager class implementing `DatasetRecordStorageManager`.
112 datastoreBridges : `type`
113 Manager class implementing `DatastoreRegistryBridgeManager`.
114 writeable : `bool`, optional
115 If `True` then the Registry will support write operations.
116 create : `bool`, optional
117 If `True` then the database schema will be initialized; it must be
118 empty before the Registry is instantiated.
119 """
121 defaultConfigFile: Optional[str] = None
122 """Path to configuration defaults. Accessed within the ``config`` resource
123 or relative to a search path. Can be `None` if no defaults are specified.
124 """
126 @classmethod
127 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
128 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
129 """Create `Registry` subclass instance from `config`.
131 Uses ``registry.cls`` from `config` to determine which subclass to
132 instantiate.
134 Parameters
135 ----------
136 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
137 Registry configuration
138 create : `bool`, optional
139 Assume empty Registry and create a new one.
140 butlerRoot : `str`, optional
141 Path to the repository root this `Registry` will manage.
142 writeable : `bool`, optional
143 If `True` (default) create a read-write connection to the database.
145 Returns
146 -------
147 registry : `Registry` (subclass)
148 A new `Registry` subclass instance.
149 """
150 if not isinstance(config, RegistryConfig):
151 if isinstance(config, str) or isinstance(config, Config):
152 config = RegistryConfig(config)
153 else:
154 raise ValueError("Incompatible Registry configuration: {}".format(config))
155 config.replaceRoot(butlerRoot)
156 DatabaseClass = config.getDatabaseClass()
157 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
158 namespace=config.get("namespace"), writeable=writeable)
159 universe = DimensionUniverse(config)
160 attributes = doImport(config["managers", "attributes"])
161 opaque = doImport(config["managers", "opaque"])
162 dimensions = doImport(config["managers", "dimensions"])
163 collections = doImport(config["managers", "collections"])
164 datasets = doImport(config["managers", "datasets"])
165 datastoreBridges = doImport(config["managers", "datastores"])
167 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
168 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
169 writeable=writeable, create=create)
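# Hypothetical usage sketch for ``fromConfig``: constructing a read-only
# Registry from a standalone registry configuration. The file path and
# repository root below are illustrative assumptions.
def exampleFromConfig() -> Registry:
    config = RegistryConfig("/repo/example/registry.yaml")  # assumed path
    return Registry.fromConfig(config, butlerRoot="/repo/example", writeable=False)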
171 def __init__(self, database: Database, universe: DimensionUniverse, *,
172 attributes: Type[ButlerAttributeManager],
173 opaque: Type[OpaqueTableStorageManager],
174 dimensions: Type[DimensionRecordStorageManager],
175 collections: Type[CollectionManager],
176 datasets: Type[DatasetRecordStorageManager],
177 datastoreBridges: Type[DatastoreRegistryBridgeManager],
178 writeable: bool = True,
179 create: bool = False):
180 self._db = database
181 self.storageClasses = StorageClassFactory()
182 with self._db.declareStaticTables(create=create) as context:
183 self._attributes = attributes.initialize(self._db, context)
184 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
185 self._collections = collections.initialize(self._db, context)
186 self._datasets = datasets.initialize(self._db, context,
187 collections=self._collections,
188 universe=self.dimensions)
189 self._opaque = opaque.initialize(self._db, context)
190 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
191 opaque=self._opaque,
192 datasets=datasets,
193 universe=self.dimensions)
194 versions = ButlerVersionsManager(
195 self._attributes,
196 dict(
197 attributes=self._attributes,
198 opaque=self._opaque,
199 dimensions=self._dimensions,
200 collections=self._collections,
201 datasets=self._datasets,
202 datastores=self._datastoreBridges,
203 )
204 )
205 # store managers and their versions in attributes table
206 context.addInitializer(lambda db: versions.storeManagersConfig())
207 context.addInitializer(lambda db: versions.storeManagersVersions())
209 if not create:
210 # verify that configured versions are compatible with schema
211 versions.checkManagersConfig()
212 versions.checkManagersVersions(writeable)
213 try:
214 versions.checkManagersDigests()
215 except DigestMismatchError as exc:
216 # A digest mismatch is potentially a serious error, but during
217 # development it could be benign; treat it as a warning for
218 # now.
219 _LOG.warning(f"Registry schema digest mismatch: {exc}")
221 self._collections.refresh()
222 self._datasets.refresh(universe=self._dimensions.universe)
224 def __str__(self) -> str:
225 return str(self._db)
227 def __repr__(self) -> str:
228 return f"Registry({self._db!r}, {self.dimensions!r})"
230 def isWriteable(self) -> bool:
231 """Return `True` if this registry allows write operations, and `False`
232 otherwise.
233 """
234 return self._db.isWriteable()
236 @property
237 def dimensions(self) -> DimensionUniverse:
238 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
239 """
240 return self._dimensions.universe
242 @contextlib.contextmanager
243 def transaction(self) -> Iterator[None]:
244 """Return a context manager that represents a transaction.
245 """
246 # TODO make savepoint=False the default.
247 try:
248 with self._db.transaction():
249 yield
250 except BaseException:
251 # TODO: this clears the caches sometimes when we wouldn't actually
252 # need to. Can we avoid that?
253 self._dimensions.clearCaches()
254 raise
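# Hypothetical sketch: grouping two tagging operations in a single
# ``transaction`` so they are applied atomically. The collection names are
# assumptions; both are assumed to be already-registered TAGGED collections.
def exampleRetag(registry: Registry, refs: List[DatasetRef]) -> None:
    with registry.transaction():
        # Either both calls take effect or neither does.
        registry.disassociate("example/old-tag", refs)
        registry.associate("example/new-tag", refs)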
256 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
257 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
258 other data repository client.
260 Opaque table records can be added via `insertOpaqueData`, retrieved via
261 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
263 Parameters
264 ----------
265 tableName : `str`
266 Logical name of the opaque table. This may differ from the
267 actual name used in the database by a prefix and/or suffix.
268 spec : `ddl.TableSpec`
269 Specification for the table to be added.
270 """
271 self._opaque.register(tableName, spec)
273 @transactional
274 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
275 """Insert records into an opaque table.
277 Parameters
278 ----------
279 tableName : `str`
280 Logical name of the opaque table. Must match the name used in a
281 previous call to `registerOpaqueTable`.
282 data
283 Each additional positional argument is a dictionary that represents
284 a single row to be added.
285 """
286 self._opaque[tableName].insert(*data)
288 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
289 """Retrieve records from an opaque table.
291 Parameters
292 ----------
293 tableName : `str`
294 Logical name of the opaque table. Must match the name used in a
295 previous call to `registerOpaqueTable`.
296 where
297 Additional keyword arguments are interpreted as equality
298 constraints that restrict the returned rows (combined with AND);
299 keyword arguments are column names and values are the values they
300 must have.
302 Yields
303 ------
304 row : `dict`
305 A dictionary representing a single result row.
306 """
307 yield from self._opaque[tableName].fetch(**where)
309 @transactional
310 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
311 """Remove records from an opaque table.
313 Parameters
314 ----------
315 tableName : `str`
316 Logical name of the opaque table. Must match the name used in a
317 previous call to `registerOpaqueTable`.
318 where
319 Additional keyword arguments are interpreted as equality
320 constraints that restrict the deleted rows (combined with AND);
321 keyword arguments are column names and values are the values they
322 must have.
323 """
324 self._opaque[tableName].delete(**where)
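# Hypothetical sketch of the opaque-table round trip described above. The
# table name and the column layout in the ``ddl`` spec are assumptions about
# a reasonable Datastore-style table, not something defined in this module.
def exampleOpaqueRoundTrip(registry: Registry) -> List[dict]:
    spec = ddl.TableSpec(fields=[
        ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
        ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    ])
    registry.registerOpaqueTable("example_datastore_records", spec)
    registry.insertOpaqueData("example_datastore_records",
                              {"dataset_id": 1, "path": "raw/a/b/c.fits"})
    rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
    registry.deleteOpaqueData("example_datastore_records", dataset_id=1)
    return rows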
326 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
327 """Add a new collection if one with the given name does not exist.
329 Parameters
330 ----------
331 name : `str`
332 The name of the collection to create.
333 type : `CollectionType`
334 Enum value indicating the type of collection to create.
336 Notes
337 -----
338 This method cannot be called within transactions, as it needs to be
339 able to perform its own transaction to be concurrent.
340 """
341 self._collections.register(name, type)
343 def getCollectionType(self, name: str) -> CollectionType:
344 """Return an enumeration value indicating the type of the given
345 collection.
347 Parameters
348 ----------
349 name : `str`
350 The name of the collection.
352 Returns
353 -------
354 type : `CollectionType`
355 Enum value indicating the type of this collection.
357 Raises
358 ------
359 MissingCollectionError
360 Raised if no collection with the given name exists.
361 """
362 return self._collections.find(name).type
364 def registerRun(self, name: str) -> None:
365 """Add a new run if one with the given name does not exist.
367 Parameters
368 ----------
369 name : `str`
370 The name of the run to create.
372 Notes
373 -----
374 This method cannot be called within transactions, as it needs to be
375 able to perform its own transaction to be concurrent.
376 """
377 self._collections.register(name, CollectionType.RUN)
379 @transactional
380 def removeCollection(self, name: str) -> None:
381 """Completely remove the given collection.
383 Parameters
384 ----------
385 name : `str`
386 The name of the collection to remove.
388 Raises
389 ------
390 MissingCollectionError
391 Raised if no collection with the given name exists.
393 Notes
394 -----
395 If this is a `~CollectionType.RUN` collection, all datasets and quanta
396 in it are also fully removed. This requires that those datasets be
397 removed (or at least trashed) from any datastores that hold them first.
399 A collection may not be deleted as long as it is referenced by a
400 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
401 be deleted or redefined first.
402 """
403 self._collections.remove(name)
405 def getCollectionChain(self, parent: str) -> CollectionSearch:
406 """Return the child collections in a `~CollectionType.CHAINED`
407 collection.
409 Parameters
410 ----------
411 parent : `str`
412 Name of the chained collection. Must have already been added via
413 a call to `Registry.registerCollection`.
415 Returns
416 -------
417 children : `CollectionSearch`
418 An object that defines the search path of the collection.
419 See :ref:`daf_butler_collection_expressions` for more information.
421 Raises
422 ------
423 MissingCollectionError
424 Raised if ``parent`` does not exist in the `Registry`.
425 TypeError
426 Raised if ``parent`` does not correspond to a
427 `~CollectionType.CHAINED` collection.
428 """
429 record = self._collections.find(parent)
430 if record.type is not CollectionType.CHAINED:
431 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
432 assert isinstance(record, ChainedCollectionRecord)
433 return record.children
435 @transactional
436 def setCollectionChain(self, parent: str, children: Any) -> None:
437 """Define or redefine a `~CollectionType.CHAINED` collection.
439 Parameters
440 ----------
441 parent : `str`
442 Name of the chained collection. Must have already been added via
443 a call to `Registry.registerCollection`.
444 children : `Any`
445 An expression defining an ordered search of child collections,
446 generally an iterable of `str`. Restrictions on the dataset types
447 to be searched can also be included, by passing a mapping or an
448 iterable containing tuples; see
449 :ref:`daf_butler_collection_expressions` for more information.
451 Raises
452 ------
453 MissingCollectionError
454 Raised when any of the given collections do not exist in the
455 `Registry`.
456 TypeError
457 Raised if ``parent`` does not correspond to a
458 `~CollectionType.CHAINED` collection.
459 ValueError
460 Raised if the given collections contain a cycle.
461 """
462 record = self._collections.find(parent)
463 if record.type is not CollectionType.CHAINED:
464 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
465 assert isinstance(record, ChainedCollectionRecord)
466 children = CollectionSearch.fromExpression(children)
467 if children != record.children:
468 record.update(self._collections, children)
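# Hypothetical sketch: building a CHAINED collection from a RUN and a TAGGED
# collection. All collection names are illustrative assumptions.
def exampleBuildChain(registry: Registry) -> CollectionSearch:
    registry.registerRun("example/run/1")
    registry.registerCollection("example/tagged", CollectionType.TAGGED)
    registry.registerCollection("example/defaults", CollectionType.CHAINED)
    registry.setCollectionChain("example/defaults", ["example/run/1", "example/tagged"])
    return registry.getCollectionChain("example/defaults")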
470 def registerDatasetType(self, datasetType: DatasetType) -> bool:
471 """
472 Add a new `DatasetType` to the Registry.
474 It is not an error to register the same `DatasetType` twice.
476 Parameters
477 ----------
478 datasetType : `DatasetType`
479 The `DatasetType` to be added.
481 Returns
482 -------
483 inserted : `bool`
484 `True` if ``datasetType`` was inserted, `False` if an identical
485 existing `DatasetType` was found. Note that in either case the
486 DatasetType is guaranteed to be defined in the Registry
487 consistently with the given definition.
489 Raises
490 ------
491 ValueError
492 Raised if the dimensions or storage class are invalid.
493 ConflictingDefinitionError
494 Raised if this DatasetType is already registered with a different
495 definition.
497 Notes
498 -----
499 This method cannot be called within transactions, as it needs to be
500 able to perform its own transaction to be concurrent.
501 """
502 _, inserted = self._datasets.register(datasetType)
503 return inserted
505 def getDatasetType(self, name: str) -> DatasetType:
506 """Get the `DatasetType`.
508 Parameters
509 ----------
510 name : `str`
511 Name of the type.
513 Returns
514 -------
515 type : `DatasetType`
516 The `DatasetType` associated with the given name.
518 Raises
519 ------
520 KeyError
521 Raised if the named DatasetType could not be found in the registry.
522 """
523 storage = self._datasets.find(name)
524 if storage is None:
525 raise KeyError(f"DatasetType '{name}' could not be found.")
526 return storage.datasetType
528 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
529 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
530 """Find a dataset given its `DatasetType` and data ID.
532 This can be used to obtain a `DatasetRef` that permits the dataset to
533 be read from a `Datastore`. If the dataset is a component and cannot
534 be found using the provided dataset type, a dataset ref for the parent
535 will be returned instead but with the correct dataset type.
537 Parameters
538 ----------
539 datasetType : `DatasetType` or `str`
540 A `DatasetType` or the name of one.
541 dataId : `dict` or `DataCoordinate`, optional
542 A `dict`-like object containing the `Dimension` links that identify
543 the dataset within a collection.
544 collections
545 An expression that fully or partially identifies the collections
546 to search for the dataset, such as a `str`, `re.Pattern`, or
547 iterable thereof. `...` can be used to return all collections.
548 See :ref:`daf_butler_collection_expressions` for more information.
549 **kwargs
550 Additional keyword arguments passed to
551 `DataCoordinate.standardize` to convert ``dataId`` to a true
552 `DataCoordinate` or augment an existing one.
554 Returns
555 -------
556 ref : `DatasetRef`
557 A reference to the dataset, or `None` if no matching Dataset
558 was found.
560 Raises
561 ------
562 LookupError
563 Raised if one or more data ID keys are missing or the dataset type
564 does not exist.
565 MissingCollectionError
566 Raised if any of ``collections`` does not exist in the registry.
567 """
568 if isinstance(datasetType, DatasetType):
569 storage = self._datasets.find(datasetType.name)
570 if storage is None:
571 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
572 else:
573 storage = self._datasets.find(datasetType)
574 if storage is None:
575 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
576 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
577 universe=self.dimensions, **kwargs)
578 collections = CollectionSearch.fromExpression(collections)
579 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
580 result = storage.find(collectionRecord, dataId)
581 if result is not None:
582 return result
584 return None
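# Hypothetical sketch: looking up a single dataset by data ID with
# ``findDataset``. The "calexp" dataset type, the data ID values, and the
# collection name are assumptions about what a concrete repository contains.
def exampleFindDataset(registry: Registry) -> Optional[DatasetRef]:
    return registry.findDataset("calexp",
                                instrument="HSC", visit=903334, detector=16,
                                collections=["example/run/1"])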
586 @transactional
587 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
588 run: str) -> List[DatasetRef]:
589 """Insert one or more datasets into the `Registry`
591 This always adds new datasets; to associate existing datasets with
592 a new collection, use ``associate``.
594 Parameters
595 ----------
596 datasetType : `DatasetType` or `str`
597 A `DatasetType` or the name of one.
598 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
599 Dimension-based identifiers for the new datasets.
600 run : `str`
601 The name of the run that produced the datasets.
603 Returns
604 -------
605 refs : `list` of `DatasetRef`
606 Resolved `DatasetRef` instances for all given data IDs (in the same
607 order).
609 Raises
610 ------
611 ConflictingDefinitionError
612 If a dataset with the same dataset type and data ID as one of those
613 given already exists in ``run``.
614 MissingCollectionError
615 Raised if ``run`` does not exist in the registry.
616 """
617 if isinstance(datasetType, DatasetType):
618 storage = self._datasets.find(datasetType.name)
619 if storage is None:
620 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
621 else:
622 storage = self._datasets.find(datasetType)
623 if storage is None:
624 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
625 runRecord = self._collections.find(run)
626 if runRecord.type is not CollectionType.RUN:
627 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
628 assert isinstance(runRecord, RunRecord)
629 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
630 for dataId in dataIds]
631 try:
632 refs = list(storage.insert(runRecord, expandedDataIds))
633 except sqlalchemy.exc.IntegrityError as err:
634 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
635 f"one or more datasets of type {storage.datasetType} into "
636 f"collection '{run}'. "
637 f"This probably means a dataset with the same data ID "
638 f"and dataset type already exists, but it may also mean a "
639 f"dimension row is missing.") from err
640 return refs
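# Hypothetical sketch: registering a dataset type and inserting one dataset
# into a RUN collection. The dataset type name, its dimensions, the storage
# class, and the data ID values are all illustrative assumptions, and the
# referenced dimension records are assumed to already exist.
def exampleInsertDatasets(registry: Registry) -> List[DatasetRef]:
    datasetType = DatasetType("example_raw", ("instrument", "exposure", "detector"),
                              "Exposure", universe=registry.dimensions)
    registry.registerDatasetType(datasetType)
    registry.registerRun("example/run/1")
    dataIds = [{"instrument": "HSC", "exposure": 903334, "detector": 16}]
    return registry.insertDatasets(datasetType, dataIds, run="example/run/1")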
642 def getDataset(self, id: int) -> Optional[DatasetRef]:
643 """Retrieve a Dataset entry.
645 Parameters
646 ----------
647 id : `int`
648 The unique identifier for the dataset.
650 Returns
651 -------
652 ref : `DatasetRef` or `None`
653 A ref to the Dataset, or `None` if no matching Dataset
654 was found.
655 """
656 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
657 if ref is None:
658 return None
659 return ref
661 @transactional
662 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
663 """Remove datasets from the Registry.
665 The datasets will be removed unconditionally from all collections, and
666 any `Quantum` that consumed this dataset will instead be marked as
667 having a NULL input. `Datastore` records will *not* be deleted; the
668 caller is responsible for ensuring that the dataset has already been
669 removed from all Datastores.
671 Parameters
672 ----------
673 refs : `Iterable` of `DatasetRef`
674 References to the datasets to be removed. Must include a valid
675 ``id`` attribute, and should be considered invalidated upon return.
677 Raises
678 ------
679 AmbiguousDatasetError
680 Raised if any ``ref.id`` is `None`.
681 OrphanedRecordError
682 Raised if any dataset is still present in any `Datastore`.
683 """
684 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
685 storage = self._datasets.find(datasetType.name)
686 assert storage is not None
687 try:
688 storage.delete(refsForType)
689 except sqlalchemy.exc.IntegrityError as err:
690 raise OrphanedRecordError("One or more datasets is still "
691 "present in one or more Datastores.") from err
693 @transactional
694 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
695 """Add existing datasets to a `~CollectionType.TAGGED` collection.
697 If a `DatasetRef` with the exact same integer ID is already in the
698 collection, nothing is changed. If a `DatasetRef` with the same
699 `DatasetType` and data ID but with different integer ID
700 exists in the collection, `ConflictingDefinitionError` is raised.
702 Parameters
703 ----------
704 collection : `str`
705 Indicates the collection the datasets should be associated with.
706 refs : `Iterable` [ `DatasetRef` ]
707 An iterable of resolved `DatasetRef` instances that already exist
708 in this `Registry`.
710 Raises
711 ------
712 ConflictingDefinitionError
713 If a Dataset with the given `DatasetRef` already exists in the
714 given collection.
715 AmbiguousDatasetError
716 Raised if ``any(ref.id is None for ref in refs)``.
717 MissingCollectionError
718 Raised if ``collection`` does not exist in the registry.
719 TypeError
720 Raised if adding new datasets to the given ``collection`` is not
721 allowed.
722 """
723 collectionRecord = self._collections.find(collection)
724 if collectionRecord.type is not CollectionType.TAGGED:
725 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
726 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
727 storage = self._datasets.find(datasetType.name)
728 assert storage is not None
729 try:
730 storage.associate(collectionRecord, refsForType)
731 except sqlalchemy.exc.IntegrityError as err:
732 raise ConflictingDefinitionError(
733 f"Constraint violation while associating dataset of type {datasetType.name} with "
734 f"collection {collection}. This probably means that one or more datasets with the same "
735 f"dataset type and data ID already exist in the collection, but it may also indicate "
736 f"that the datasets do not exist."
737 ) from err
739 @transactional
740 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
741 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
743 ``collection`` and ``ref`` combinations that are not currently
744 associated are silently ignored.
746 Parameters
747 ----------
748 collection : `str`
749 The collection the datasets should no longer be associated with.
750 refs : `Iterable` [ `DatasetRef` ]
751 An iterable of resolved `DatasetRef` instances that already exist
752 in this `Registry`.
754 Raises
755 ------
756 AmbiguousDatasetError
757 Raised if any of the given dataset references is unresolved.
758 MissingCollectionError
759 Raised if ``collection`` does not exist in the registry.
760 TypeError
761 Raised if removing datasets from the given ``collection`` is not
762 allowed.
763 """
764 collectionRecord = self._collections.find(collection)
765 if collectionRecord.type is not CollectionType.TAGGED:
766 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
767 "expected TAGGED.")
768 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
769 storage = self._datasets.find(datasetType.name)
770 assert storage is not None
771 storage.disassociate(collectionRecord, refsForType)
773 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
774 """Return an object that allows a new `Datastore` instance to
775 communicate with this `Registry`.
777 Returns
778 -------
779 manager : `DatastoreRegistryBridgeManager`
780 Object that mediates communication between this `Registry` and its
781 associated datastores.
782 """
783 return self._datastoreBridges
785 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
786 """Retrieve datastore locations for a given dataset.
788 Parameters
789 ----------
790 ref : `DatasetRef`
791 A reference to the dataset for which to retrieve storage
792 information.
794 Returns
795 -------
796 datastores : `Iterable` [ `str` ]
797 All the matching datastores holding this dataset.
799 Raises
800 ------
801 AmbiguousDatasetError
802 Raised if ``ref.id`` is `None`.
803 """
804 return self._datastoreBridges.findDatastores(ref)
806 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
807 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
808 **kwargs: Any) -> DataCoordinate:
809 """Expand a dimension-based data ID to include additional information.
811 Parameters
812 ----------
813 dataId : `DataCoordinate` or `dict`, optional
814 Data ID to be expanded; augmented and overridden by ``kwargs``.
815 graph : `DimensionGraph`, optional
816 Set of dimensions for the expanded ID. If `None`, the dimensions
817 will be inferred from the keys of ``dataId`` and ``kwargs``.
818 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
819 are silently ignored, providing a way to extract and expand a
820 subset of a data ID.
821 records : `Mapping` [`str`, `DimensionRecord`], optional
822 Dimension record data to use before querying the database for that
823 data, keyed by element name.
824 **kwargs
825 Additional keywords are treated like additional key-value pairs for
826 ``dataId``, extending and overriding it.
828 Returns
829 -------
830 expanded : `DataCoordinate`
831 A data ID that includes full metadata for all of the dimensions it
832 identifies, i.e. it guarantees that ``expanded.hasRecords()`` and
833 ``expanded.hasFull()`` both return `True`.
834 """
835 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
836 if standardized.hasRecords():
837 return standardized
838 if records is None:
839 records = {}
840 elif isinstance(records, NamedKeyMapping):
841 records = records.byName()
842 else:
843 records = dict(records)
844 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
845 records.update(dataId.records.byName())
846 keys = standardized.byName()
847 for element in standardized.graph.primaryKeyTraversalOrder:
848 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
849 if record is ...:
850 if isinstance(element, Dimension) and keys.get(element.name) is None:
851 if element in standardized.graph.required:
852 raise LookupError(
853 f"No value or null value for required dimension {element.name}."
854 )
855 keys[element.name] = None
856 record = None
857 else:
858 storage = self._dimensions[element]
859 dataIdSet = DataCoordinateIterable.fromScalar(
860 DataCoordinate.standardize(keys, graph=element.graph)
861 )
862 fetched = tuple(storage.fetch(dataIdSet))
863 try:
864 (record,) = fetched
865 except ValueError:
866 record = None
867 records[element.name] = record
868 if record is not None:
869 for d in element.implied:
870 value = getattr(record, d.name)
871 if keys.setdefault(d.name, value) != value:
872 raise InconsistentDataIdError(
873 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
874 f"but {element.name} implies {d.name}={value!r}."
875 )
876 else:
877 if element in standardized.graph.required:
878 raise LookupError(
879 f"Could not fetch record for required dimension {element.name} via keys {keys}."
880 )
881 if element.alwaysJoin:
882 raise InconsistentDataIdError(
883 f"Could not fetch record for element {element.name} via keys {keys}, "
884 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
885 "related."
886 )
887 for d in element.implied:
888 keys.setdefault(d.name, None)
889 records.setdefault(d.name, None)
890 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
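# Hypothetical sketch: expanding a minimal data ID so that the attached
# dimension records can be read back. The data ID values are assumptions and
# the corresponding dimension rows are assumed to be present in the registry.
def exampleExpandDataId(registry: Registry) -> DataCoordinate:
    dataId = registry.expandDataId(instrument="HSC", exposure=903334, detector=16)
    assert dataId.hasFull() and dataId.hasRecords()
    return dataId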
892 def insertDimensionData(self, element: Union[DimensionElement, str],
893 *data: Union[Mapping[str, Any], DimensionRecord],
894 conform: bool = True) -> None:
895 """Insert one or more dimension records into the database.
897 Parameters
898 ----------
899 element : `DimensionElement` or `str`
900 The `DimensionElement` or name thereof that identifies the table
901 records will be inserted into.
902 data : `dict` or `DimensionRecord` (variadic)
903 One or more records to insert.
904 conform : `bool`, optional
905 If `False` (`True` is default) perform no checking or conversions,
906 and assume that ``element`` is a `DimensionElement` instance and
907 ``data`` contains one or more `DimensionRecord` instances of the
908 appropriate subclass.
909 """
910 if conform:
911 if isinstance(element, str):
912 element = self.dimensions[element]
913 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
914 for row in data]
915 else:
916 # Ignore typing since caller said to trust them with conform=False.
917 records = data # type: ignore
918 storage = self._dimensions[element] # type: ignore
919 storage.insert(*records)
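# Hypothetical sketch: inserting dimension records as plain dicts. The field
# names and values below are assumptions about the default dimension schema;
# a real schema may require additional or different fields.
def exampleInsertDimensionData(registry: Registry) -> None:
    registry.insertDimensionData("instrument",
                                 {"name": "HSC", "visit_max": 21474800,
                                  "exposure_max": 21474800, "detector_max": 200})
    registry.insertDimensionData("detector",
                                 {"instrument": "HSC", "id": 16, "full_name": "1_16"})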
921 def syncDimensionData(self, element: Union[DimensionElement, str],
922 row: Union[Mapping[str, Any], DimensionRecord],
923 conform: bool = True) -> bool:
924 """Synchronize the given dimension record with the database, inserting
925 if it does not already exist and comparing values if it does.
927 Parameters
928 ----------
929 element : `DimensionElement` or `str`
930 The `DimensionElement` or name thereof that identifies the table
931 records will be inserted into.
932 row : `dict` or `DimensionRecord`
933 The record to insert.
934 conform : `bool`, optional
935 If `False` (`True` is default) perform no checking or conversions,
936 and assume that ``element`` is a `DimensionElement` instance and
937 ``row`` is a `DimensionRecord` instance of the
938 appropriate subclass.
940 Returns
941 -------
942 inserted : `bool`
943 `True` if a new row was inserted, `False` otherwise.
945 Raises
946 ------
947 ConflictingDefinitionError
948 Raised if the record exists in the database (according to primary
949 key lookup) but is inconsistent with the given one.
951 Notes
952 -----
953 This method cannot be called within transactions, as it needs to be
954 able to perform its own transaction to be concurrent.
955 """
956 if conform:
957 if isinstance(element, str):
958 element = self.dimensions[element]
959 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
960 else:
961 # Ignore typing since caller said to trust them with conform=False.
962 record = row # type: ignore
963 storage = self._dimensions[element] # type: ignore
964 return storage.sync(record)
966 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
967 ) -> Iterator[DatasetType]:
968 """Iterate over the dataset types whose names match an expression.
970 Parameters
971 ----------
972 expression : `Any`, optional
973 An expression that fully or partially identifies the dataset types
974 to return, such as a `str`, `re.Pattern`, or iterable thereof.
975 `...` can be used to return all dataset types, and is the default.
976 See :ref:`daf_butler_dataset_type_expressions` for more
977 information.
978 components : `bool`, optional
979 If `True`, apply all expression patterns to component dataset type
980 names as well. If `False`, never apply patterns to components.
981 If `None` (default), apply patterns to components only if their
982 parent datasets were not matched by the expression.
983 Fully-specified component datasets (`str` or `DatasetType`
984 instances) are always included.
986 Yields
987 ------
988 datasetType : `DatasetType`
989 A `DatasetType` instance whose name matches ``expression``.
990 """
991 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
992 if wildcard is Ellipsis:
993 for datasetType in self._datasets:
994 # The dataset type can no longer be a component
995 yield datasetType
996 if components and datasetType.isComposite():
997 # Automatically create the component dataset types
998 for component in datasetType.makeAllComponentDatasetTypes():
999 yield component
1000 return
1001 done: Set[str] = set()
1002 for name in wildcard.strings:
1003 storage = self._datasets.find(name)
1004 if storage is not None:
1005 done.add(storage.datasetType.name)
1006 yield storage.datasetType
1007 if wildcard.patterns:
1008 # If components (the argument) is None, we'll save component
1009 # datasets that we might want to match, but only if their parents
1010 # didn't get included.
1011 componentsForLater = []
1012 for registeredDatasetType in self._datasets:
1013 # Components are not stored in registry so expand them here
1014 allDatasetTypes = [registeredDatasetType] \
1015 + registeredDatasetType.makeAllComponentDatasetTypes()
1016 for datasetType in allDatasetTypes:
1017 if datasetType.name in done:
1018 continue
1019 parentName, componentName = datasetType.nameAndComponent()
1020 if componentName is not None and not components:
1021 if components is None and parentName not in done:
1022 componentsForLater.append(datasetType)
1023 continue
1024 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1025 done.add(datasetType.name)
1026 yield datasetType
1027 # Go back and try to match saved components.
1028 for datasetType in componentsForLater:
1029 parentName, _ = datasetType.nameAndComponent()
1030 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1031 yield datasetType
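# Hypothetical sketch: matching dataset type names against a regular
# expression with ``queryDatasetTypes``; the pattern is an illustrative
# assumption.
def exampleQueryDatasetTypes(registry: Registry) -> List[DatasetType]:
    import re
    return list(registry.queryDatasetTypes(re.compile(r"example_.*"), components=False))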
1033 def queryCollections(self, expression: Any = ...,
1034 datasetType: Optional[DatasetType] = None,
1035 collectionType: Optional[CollectionType] = None,
1036 flattenChains: bool = False,
1037 includeChains: Optional[bool] = None) -> Iterator[str]:
1038 """Iterate over the collections whose names match an expression.
1040 Parameters
1041 ----------
1042 expression : `Any`, optional
1043 An expression that fully or partially identifies the collections
1044 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1045 `...` can be used to return all collections, and is the default.
1046 See :ref:`daf_butler_collection_expressions` for more
1047 information.
1048 datasetType : `DatasetType`, optional
1049 If provided, only yield collections that should be searched for
1050 this dataset type according to ``expression``. If this is
1051 not provided, any dataset type restrictions in ``expression`` are
1052 ignored.
1053 collectionType : `CollectionType`, optional
1054 If provided, only yield collections of this type.
1055 flattenChains : `bool`, optional
1056 If `True` (`False` is default), recursively yield the child
1057 collections of matching `~CollectionType.CHAINED` collections.
1058 includeChains : `bool`, optional
1059 If `True`, yield records for matching `~CollectionType.CHAINED`
1060 collections. Default is the opposite of ``flattenChains``: include
1061 either CHAINED collections or their children, but not both.
1063 Yields
1064 ------
1065 collection : `str`
1066 The name of a collection that matches ``expression``.
1067 """
1068 query = CollectionQuery.fromExpression(expression)
1069 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1070 flattenChains=flattenChains, includeChains=includeChains):
1071 yield record.name
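# Hypothetical sketch: listing only RUN collections whose names match a
# pattern; the pattern is an illustrative assumption.
def exampleQueryRuns(registry: Registry) -> List[str]:
    import re
    return list(registry.queryCollections(re.compile(r"example/run/.*"),
                                          collectionType=CollectionType.RUN))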
1073 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1074 """Return a `QueryBuilder` instance capable of constructing and
1075 managing more complex queries than those obtainable via `Registry`
1076 interfaces.
1078 This is an advanced interface; downstream code should prefer
1079 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1080 are sufficient.
1082 Parameters
1083 ----------
1084 summary : `QuerySummary`
1085 Object describing and categorizing the full set of dimensions that
1086 will be included in the query.
1088 Returns
1089 -------
1090 builder : `QueryBuilder`
1091 Object that can be used to construct and perform advanced queries.
1092 """
1093 return QueryBuilder(summary=summary,
1094 collections=self._collections,
1095 dimensions=self._dimensions,
1096 datasets=self._datasets)
1098 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1099 dataId: Optional[DataId] = None,
1100 datasets: Any = None,
1101 collections: Any = None,
1102 where: Optional[str] = None,
1103 expand: bool = True,
1104 components: Optional[bool] = None,
1105 **kwargs: Any) -> Iterator[DataCoordinate]:
1106 """Query for and iterate over data IDs matching user-provided criteria.
1108 Parameters
1109 ----------
1110 dimensions : `Dimension` or `str`, or iterable thereof
1111 The dimensions of the data IDs to yield, as either `Dimension`
1112 instances or `str`. Will be automatically expanded to a complete
1113 `DimensionGraph`.
1114 dataId : `dict` or `DataCoordinate`, optional
1115 A data ID whose key-value pairs are used as equality constraints
1116 in the query.
1117 datasets : `Any`, optional
1118 An expression that fully or partially identifies dataset types
1119 that should constrain the yielded data IDs. For example, including
1120 "raw" here would constrain the yielded ``instrument``,
1121 ``exposure``, ``detector``, and ``physical_filter`` values to only
1122 those for which at least one "raw" dataset exists in
1123 ``collections``. Allowed types include `DatasetType`, `str`,
1124 `re.Pattern`, and iterables thereof. Unlike other dataset type
1125 expressions, `...` is not permitted - it doesn't make sense to
1126 constrain data IDs on the existence of *all* datasets.
1127 See :ref:`daf_butler_dataset_type_expressions` for more
1128 information.
1129 collections : `Any`, optional
1130 An expression that fully or partially identifies the collections
1131 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1132 thereof. `...` can be used to return all collections. Must be
1133 provided if ``datasets`` is, and is ignored if it is not. See
1134 :ref:`daf_butler_collection_expressions` for more information.
1135 where : `str`, optional
1136 A string expression similar to a SQL WHERE clause. May involve
1137 any column of a dimension table or (as a shortcut for the primary
1138 key column of a dimension table) dimension name. See
1139 :ref:`daf_butler_dimension_expressions` for more information.
1140 expand : `bool`, optional
1141 If `True` (default) yield `DataCoordinate` instances for which
1142 `~DataCoordinate.hasRecords` is guaranteed to return `True`,
1143 performing extra database fetches as necessary.
1144 components : `bool`, optional
1145 If `True`, apply all dataset expression patterns to component
1146 dataset type names as well. If `False`, never apply patterns to
1147 components. If `None` (default), apply patterns to components only
1148 if their parent datasets were not matched by the expression.
1149 Fully-specified component datasets (`str` or `DatasetType`
1150 instances) are always included.
1151 **kwargs
1152 Additional keyword arguments are forwarded to
1153 `DataCoordinate.standardize` when processing the ``dataId``
1154 argument (and may be used to provide a constraining data ID even
1155 when the ``dataId`` argument is `None`).
1157 Yields
1158 ------
1159 dataId : `DataCoordinate`
1160 Data IDs matching the given query parameters. Order is
1161 unspecified.
1162 """
1163 dimensions = iterable(dimensions)
1164 standardizedDataId = self.expandDataId(dataId, **kwargs)
1165 standardizedDatasetTypes = set()
1166 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1167 if datasets is not None:
1168 if collections is None:
1169 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1170 for datasetType in self.queryDatasetTypes(datasets, components=components):
1171 requestedDimensionNames.update(datasetType.dimensions.names)
1172 # If any matched dataset type is a component, just operate on
1173 # its parent instead, because Registry doesn't know anything
1174 # about what components exist, and here (unlike queryDatasets)
1175 # we don't care about returning them.
1176 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1177 if componentName is not None:
1178 datasetType = self.getDatasetType(parentDatasetTypeName)
1179 standardizedDatasetTypes.add(datasetType)
1180 # Preprocess collections expression in case the original included
1181 # single-pass iterators (we'll want to use it multiple times
1182 # below).
1183 collections = CollectionQuery.fromExpression(collections)
1185 summary = QuerySummary(
1186 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1187 dataId=standardizedDataId,
1188 expression=where,
1189 )
1190 builder = self.makeQueryBuilder(summary)
1191 for datasetType in standardizedDatasetTypes:
1192 builder.joinDataset(datasetType, collections, isResult=False)
1193 query = builder.finish()
1194 predicate = query.predicate()
1195 for row in self._db.query(query.sql):
1196 if predicate(row):
1197 result = query.extractDataId(row)
1198 if expand:
1199 yield self.expandDataId(
1200 result,
1201 records=standardizedDataId.records,
1202 )
1203 else:
1204 yield result
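# Hypothetical sketch: querying data IDs constrained both by a ``where``
# expression and by the existence of a dataset. The dataset type, collection,
# and dimension values are illustrative assumptions.
def exampleQueryDimensions(registry: Registry) -> List[DataCoordinate]:
    return list(registry.queryDimensions(["exposure", "detector"],
                                         datasets="example_raw",
                                         collections=["example/run/1"],
                                         where="instrument = 'HSC' AND detector = 16"))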
1206 def queryDatasets(self, datasetType: Any, *,
1207 collections: Any,
1208 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1209 dataId: Optional[DataId] = None,
1210 where: Optional[str] = None,
1211 deduplicate: bool = False,
1212 expand: bool = True,
1213 components: Optional[bool] = None,
1214 **kwargs: Any) -> Iterator[DatasetRef]:
1215 """Query for and iterate over dataset references matching user-provided
1216 criteria.
1218 Parameters
1219 ----------
1220 datasetType
1221 An expression that fully or partially identifies the dataset types
1222 to be queried. Allowed types include `DatasetType`, `str`,
1223 `re.Pattern`, and iterables thereof. The special value `...` can
1224 be used to query all dataset types. See
1225 :ref:`daf_butler_dataset_type_expressions` for more information.
1226 collections
1227 An expression that fully or partially identifies the collections
1228 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1229 thereof. `...` can be used to return all collections. See
1230 :ref:`daf_butler_collection_expressions` for more information.
1231 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1232 Dimensions to include in the query (in addition to those used
1233 to identify the queried dataset type(s)), either to constrain
1234 the resulting datasets to those for which a matching dimension
1235 exists, or to relate the dataset type's dimensions to dimensions
1236 referenced by the ``dataId`` or ``where`` arguments.
1237 dataId : `dict` or `DataCoordinate`, optional
1238 A data ID whose key-value pairs are used as equality constraints
1239 in the query.
1240 where : `str`, optional
1241 A string expression similar to a SQL WHERE clause. May involve
1242 any column of a dimension table or (as a shortcut for the primary
1243 key column of a dimension table) dimension name. See
1244 :ref:`daf_butler_dimension_expressions` for more information.
1245 deduplicate : `bool`, optional
1246 If `True` (`False` is default), for each result data ID, only
1247 yield one `DatasetRef` of each `DatasetType`, from the first
1248 collection in which a dataset of that dataset type appears
1249 (according to the order of ``collections`` passed in). If `True`,
1250 ``collections`` must not contain regular expressions and may not
1251 be `...`.
1252 expand : `bool`, optional
1253 If `True` (default) attach `DataCoordinate` instances for which
1254 `~DataCoordinate.hasRecords` is guaranteed to return `True`,
1255 performing extra database fetches as necessary.
1256 components : `bool`, optional
1257 If `True`, apply all dataset expression patterns to component
1258 dataset type names as well. If `False`, never apply patterns to
1259 components. If `None` (default), apply patterns to components only
1260 if their parent datasets were not matched by the expression.
1261 Fully-specified component datasets (`str` or `DatasetType`
1262 instances) are always included.
1263 **kwargs
1264 Additional keyword arguments are forwarded to
1265 `DataCoordinate.standardize` when processing the ``dataId``
1266 argument (and may be used to provide a constraining data ID even
1267 when the ``dataId`` argument is `None`).
1269 Yields
1270 ------
1271 ref : `DatasetRef`
1272 Dataset references matching the given query criteria. These
1273 are grouped by `DatasetType` if the query evaluates to multiple
1274 dataset types, but order is otherwise unspecified.
1276 Raises
1277 ------
1278 TypeError
1279 Raised when the arguments are incompatible, such as when a
1280 collection wildcard is passed when ``deduplicate`` is `True`.
1282 Notes
1283 -----
1284 When multiple dataset types are queried in a single call, the
1285 results of this operation are equivalent to querying for each dataset
1286 type separately in turn, and no information about the relationships
1287 between datasets of different types is included. In contexts where
1288 that kind of information is important, the recommended pattern is to
1289 use `queryDimensions` to first obtain data IDs (possibly with the
1290 desired dataset types and collections passed as constraints to the
1291 query), and then use multiple (generally much simpler) calls to
1292 `queryDatasets` with the returned data IDs passed as constraints.
1293 """
1294 # Standardize the collections expression.
1295 if deduplicate:
1296 collections = CollectionSearch.fromExpression(collections)
1297 else:
1298 collections = CollectionQuery.fromExpression(collections)
1299 # Standardize and expand the data ID provided as a constraint.
1300 standardizedDataId = self.expandDataId(dataId, **kwargs)
1302 # We can only query directly if given a non-component DatasetType
1303 # instance. If we were given an expression or str or a component
1304 # DatasetType instance, we'll populate this dict, recurse, and return.
1305 # If we already have a non-component DatasetType, it will remain None
1306 # and we'll run the query directly.
1307 composition: Optional[
1308 Dict[
1309 DatasetType, # parent dataset type
1310 List[Optional[str]] # component name, or None for parent
1311 ]
1312 ] = None
1313 if not isinstance(datasetType, DatasetType):
1314 # We were given a dataset type expression (which may be as simple
1315 # as a str). Loop over all matching datasets, delegating handling
1316 # of the `components` argument to queryDatasetTypes, as we populate
1317 # the composition dict.
1318 composition = defaultdict(list)
1319 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1320 parentName, componentName = trueDatasetType.nameAndComponent()
1321 if componentName is not None:
1322 parentDatasetType = self.getDatasetType(parentName)
1323 composition.setdefault(parentDatasetType, []).append(componentName)
1324 else:
1325 composition.setdefault(trueDatasetType, []).append(None)
1326 elif datasetType.isComponent():
1327 # We were given a true DatasetType instance, but it's a component.
1328 # The composition dict will have exactly one item.
1329 parentName, componentName = datasetType.nameAndComponent()
1330 parentDatasetType = self.getDatasetType(parentName)
1331 composition = {parentDatasetType: [componentName]}
1332 if composition is not None:
1333 # We need to recurse. Do that once for each parent dataset type.
1334 for parentDatasetType, componentNames in composition.items():
1335 for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
1336 dimensions=dimensions, dataId=standardizedDataId,
1337 where=where, deduplicate=deduplicate):
1338 # Loop over components, yielding one ref for each component
1339 # requested.
1340 for componentName in componentNames:
1341 if componentName is None:
1342 yield parentRef
1343 else:
1344 yield parentRef.makeComponentRef(componentName)
1345 return
1346 # If we get here, there's no need to recurse (or we are already
1347 # recursing; there can only ever be one level of recursion).
1349 # The full set of dimensions in the query is the combination of those
1350 # needed for the DatasetType and those explicitly requested, if any.
1351 requestedDimensionNames = set(datasetType.dimensions.names)
1352 if dimensions is not None:
1353 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1354 # Construct the summary structure needed to construct a QueryBuilder.
1355 summary = QuerySummary(
1356 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1357 dataId=standardizedDataId,
1358 expression=where,
1359 )
1360 builder = self.makeQueryBuilder(summary)
1361 # Add the dataset subquery to the query, telling the QueryBuilder to
1362 # include the rank of the selected collection in the results only if we
1363 # need to deduplicate. Note that if any of the collections are
1364 # actually wildcard expressions, and we've asked for deduplication,
1365 # this will raise TypeError for us.
1366 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1367 return
1368 query = builder.finish()
1369 predicate = query.predicate()
1370 if not deduplicate:
1371 # No need to de-duplicate across collections.
1372 for row in self._db.query(query.sql):
1373 if predicate(row):
1374 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1375 if expand:
1376 dataId = self.expandDataId(
1377 dataId,
1378 records=standardizedDataId.records
1379 )
1380 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1381 else:
1382 # For each data ID, yield only the DatasetRef with the lowest
1383 # collection rank.
1384 bestRefs = {}
1385 bestRanks: Dict[DataCoordinate, int] = {}
1386 for row in self._db.query(query.sql):
1387 if predicate(row):
1388 ref, rank = query.extractDatasetRef(row, datasetType)
1389 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1390 assert rank is not None
1391 if rank < bestRank:
1392 bestRefs[ref.dataId] = ref
1393 bestRanks[ref.dataId] = rank
1394 # If caller requested expanded data IDs, we defer that until here
1395 # so we do as little expansion as possible.
1396 if expand:
1397 for ref in bestRefs.values():
1398 dataId = self.expandDataId(
1399 ref.dataId,
1400 records=standardizedDataId.records
1401 )
1402 yield ref.expanded(dataId)
1403 else:
1404 yield from bestRefs.values()
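# Hypothetical sketch: a deduplicated dataset query that takes the first match
# per data ID from an ordered list of collections. All names are assumptions.
def exampleQueryDatasets(registry: Registry) -> List[DatasetRef]:
    return list(registry.queryDatasets("example_raw",
                                       collections=["example/run/2", "example/run/1"],
                                       where="instrument = 'HSC'",
                                       deduplicate=True))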
1406 storageClasses: StorageClassFactory
1407 """All storage classes known to the registry (`StorageClassFactory`).
1408 """