Coverage for python/lsst/daf/butler/registry/_registry.py : 14%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ConsistentDataIds",
26 "Registry",
27)
29from collections import defaultdict
30import contextlib
31from dataclasses import dataclass
32import sys
33from typing import (
34 Any,
35 Iterable,
36 Iterator,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Type,
42 TYPE_CHECKING,
43 Union,
44)
46import sqlalchemy
48import lsst.sphgeom
49from ..core import (
50 Config,
51 DataCoordinate,
52 DataId,
53 DatasetRef,
54 DatasetType,
55 Dimension,
56 DimensionElement,
57 DimensionGraph,
58 DimensionRecord,
59 DimensionUniverse,
60 ExpandedDataCoordinate,
61 FakeDatasetRef,
62 StorageClassFactory,
63)
64from ..core import ddl
65from ..core.utils import doImport, iterable, transactional
66from ._config import RegistryConfig
67from .queries import (
68 QueryBuilder,
69 QuerySummary,
70)
71from .tables import makeRegistryTableSpecs
72from ._collectionType import CollectionType
73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch
76if TYPE_CHECKING:
77 from ..butlerConfig import ButlerConfig
78 from ..core import (
79 Quantum
80 )
81 from .interfaces import (
82 CollectionManager,
83 Database,
84 OpaqueTableStorageManager,
85 DimensionRecordStorageManager,
86 DatasetRecordStorageManager,
87 )
90@dataclass
91class ConsistentDataIds:
92 """A struct used to report relationships between data IDs by
93 `Registry.relateDataIds`.
95 If an instance of this class is returned (instead of `None`), the data IDs
96 are "not inconsistent" - any keys they have in common have the same value,
97 and any spatial or temporal relationships they have at least might involve
98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces
99 to `True` in boolean contexts.
100 """
102 overlaps: bool
103 """If `True`, the data IDs have at least one key in common, associated with
104 the same value.
106 Note that data IDs are not inconsistent even if overlaps is `False` - they
107 may simply have no keys in common, which means they cannot have
108 inconsistent values for any keys. They may even be equal, in the case that
109 both data IDs are empty.
111 This field does _not_ indicate whether a spatial or temporal overlap
112 relationship exists.
113 """
115 contains: bool
116 """If `True`, all keys in the first data ID are in the second, and are
117 associated with the same values.
119 This includes the case where the first data ID is empty.
120 """
122 within: bool
123 """If `True`, all keys in the second data ID are in the first, and are
124 associated with the same values.
126 This includes the case where the second data ID is empty.
127 """
129 @property
130 def equal(self) -> bool:
131 """If `True`, the two data IDs are the same.
133 Data IDs are equal if they have both a `contains` and a `within`
134 relationship.
135 """
136 return self.contains and self.within
138 @property
139 def disjoint(self) -> bool:
140 """If `True`, the two data IDs have no keys in common.
142 This is simply the opposite of `overlaps`. Disjoint data IDs are by
143 definition not inconsistent.
144 """
145 return not self.overlaps
147 def __bool__(self) -> bool:
148 return True
151class Registry:
152 """Registry interface.
154 Parameters
155 ----------
156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
157 Registry configuration
158 """
160 defaultConfigFile = None
161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
162 absolute path. Can be None if no defaults specified.
163 """
165 @classmethod
166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
168 """Create `Registry` subclass instance from `config`.
170 Uses ``registry.cls`` from `config` to determine which subclass to
171 instantiate.
173 Parameters
174 ----------
175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
176 Registry configuration
177 create : `bool`, optional
178 Assume empty Registry and create a new one.
179 butlerRoot : `str`, optional
180 Path to the repository root this `Registry` will manage.
181 writeable : `bool`, optional
182 If `True` (default) create a read-write connection to the database.
184 Returns
185 -------
186 registry : `Registry` (subclass)
187 A new `Registry` subclass instance.
188 """
189 if not isinstance(config, RegistryConfig):
190 if isinstance(config, str) or isinstance(config, Config):
191 config = RegistryConfig(config)
192 else:
193 raise ValueError("Incompatible Registry configuration: {}".format(config))
194 config.replaceRoot(butlerRoot)
195 DatabaseClass = config.getDatabaseClass()
196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
197 namespace=config.get("namespace"), writeable=writeable)
198 universe = DimensionUniverse(config)
199 opaque = doImport(config["managers", "opaque"])
200 dimensions = doImport(config["managers", "dimensions"])
201 collections = doImport(config["managers", "collections"])
202 datasets = doImport(config["managers", "datasets"])
203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
204 datasets=datasets, create=create)
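# A minimal construction sketch, assuming a RegistryConfig stored in a
# hypothetical "registry.yaml" file (file name and import paths are
# assumptions, not defined in this module):
#
#     from lsst.daf.butler.registry import Registry, RegistryConfig
#
#     config = RegistryConfig("registry.yaml")
#     registry = Registry.fromConfig(config, create=True, writeable=True)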
206 def __init__(self, database: Database, universe: DimensionUniverse, *,
207 opaque: Type[OpaqueTableStorageManager],
208 dimensions: Type[DimensionRecordStorageManager],
209 collections: Type[CollectionManager],
210 datasets: Type[DatasetRecordStorageManager],
211 create: bool = False):
212 self._db = database
213 self.storageClasses = StorageClassFactory()
214 with self._db.declareStaticTables(create=create) as context:
215 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
216 self._collections = collections.initialize(self._db, context)
217 self._datasets = datasets.initialize(self._db, context,
218 collections=self._collections,
219 universe=self.dimensions)
220 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions,
221 self._collections,
222 self._datasets))
223 self._opaque = opaque.initialize(self._db, context)
224 self._collections.refresh()
225 self._datasets.refresh(universe=self._dimensions.universe)
227 def __str__(self) -> str:
228 return str(self._db)
230 def __repr__(self) -> str:
231 return f"Registry({self._db!r}, {self.dimensions!r})"
233 def isWriteable(self) -> bool:
234 """Return `True` if this registry allows write operations, and `False`
235 otherwise.
236 """
237 return self._db.isWriteable()
239 @property
240 def dimensions(self) -> DimensionUniverse:
241 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
242 """
243 return self._dimensions.universe
245 @contextlib.contextmanager
246 def transaction(self):
247 """Return a context manager that represents a transaction.
248 """
249 # TODO make savepoint=False the default.
250 try:
251 with self._db.transaction():
252 yield
253 except BaseException:
254 # TODO: this clears the caches sometimes when we wouldn't actually
255 # need to. Can we avoid that?
256 self._dimensions.clearCaches()
257 raise
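# A sketch of grouping several writes in one transaction so they commit or
# roll back together; ``registry`` is a `Registry` instance and the dimension
# names/record fields are illustrative only:
#
#     with registry.transaction():
#         registry.insertDimensionData("instrument", {"name": "DummyCam"})
#         registry.insertDimensionData("detector", {"instrument": "DummyCam",
#                                                   "id": 1, "full_name": "one"})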
259 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
260 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
261 other data repository client.
263 Opaque table records can be added via `insertOpaqueData`, retrieved via
264 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
266 Parameters
267 ----------
268 tableName : `str`
269 Logical name of the opaque table. This may differ from the
270 actual name used in the database by a prefix and/or suffix.
271 spec : `ddl.TableSpec`
272 Specification for the table to be added.
273 """
274 self._opaque.register(tableName, spec)
276 @transactional
277 def insertOpaqueData(self, tableName: str, *data: dict):
278 """Insert records into an opaque table.
280 Parameters
281 ----------
282 tableName : `str`
283 Logical name of the opaque table. Must match the name used in a
284 previous call to `registerOpaqueTable`.
285 data
286 Each additional positional argument is a dictionary that represents
287 a single row to be added.
288 """
289 self._opaque[tableName].insert(*data)
291 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
292 """Retrieve records from an opaque table.
294 Parameters
295 ----------
296 tableName : `str`
297 Logical name of the opaque table. Must match the name used in a
298 previous call to `registerOpaqueTable`.
299 where
300 Additional keyword arguments are interpreted as equality
301 constraints that restrict the returned rows (combined with AND);
302 keyword arguments are column names and values are the values they
303 must have.
305 Yields
306 ------
307 row : `dict`
308 A dictionary representing a single result row.
309 """
310 yield from self._opaque[tableName].fetch(**where)
312 @transactional
313 def deleteOpaqueData(self, tableName: str, **where: Any):
314 """Remove records from an opaque table.
316 Parameters
317 ----------
318 tableName : `str`
319 Logical name of the opaque table. Must match the name used in a
320 previous call to `registerOpaqueTable`.
321 where
322 Additional keyword arguments are interpreted as equality
323 constraints that restrict the deleted rows (combined with AND);
324 keyword arguments are column names and values are the values they
325 must have.
326 """
327 self._opaque[tableName].delete(**where)
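# Opaque-table round-trip sketch (the table name and columns are illustrative,
# not defined by this module):
#
#     import sqlalchemy
#     from lsst.daf.butler.core import ddl
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)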
329 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
330 """Add a new collection if one with the given name does not exist.
332 Parameters
333 ----------
334 name : `str`
335 The name of the collection to create.
336 type : `CollectionType`
337 Enum value indicating the type of collection to create.
339 Notes
340 -----
341 This method cannot be called within transactions, as it needs to be
342 able to perform its own transaction to be concurrent.
343 """
344 self._collections.register(name, type)
346 def getCollectionType(self, name: str) -> CollectionType:
347 """Return an enumeration value indicating the type of the given
348 collection.
350 Parameters
351 ----------
352 name : `str`
353 The name of the collection.
355 Returns
356 -------
357 type : `CollectionType`
358 Enum value indicating the type of this collection.
360 Raises
361 ------
362 MissingCollectionError
363 Raised if no collection with the given name exists.
364 """
365 return self._collections.find(name).type
367 def registerRun(self, name: str):
368 """Add a new run if one with the given name does not exist.
370 Parameters
371 ----------
372 name : `str`
373 The name of the run to create.
375 Notes
376 -----
377 This method cannot be called within transactions, as it needs to be
378 able to perform its own transaction to be concurrent.
379 """
380 self._collections.register(name, CollectionType.RUN)
382 @transactional
383 def removeCollection(self, name: str):
384 """Completely remove the given collection.
386 Parameters
387 ----------
388 name : `str`
389 The name of the collection to remove.
391 Raises
392 ------
393 MissingCollectionError
394 Raised if no collection with the given name exists.
396 Notes
397 -----
398 If this is a `~CollectionType.RUN` collection, all datasets and quanta
399 in it are also fully removed. This requires that those datasets be
400 removed (or at least trashed) from any datastores that hold them first.
402 A collection may not be deleted as long as it is referenced by a
403 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
404 be deleted or redefined first.
405 """
406 self._collections.remove(name)
408 def getCollectionChain(self, parent: str) -> CollectionSearch:
409 """Return the child collections in a `~CollectionType.CHAINED`
410 collection.
412 Parameters
413 ----------
414 parent : `str`
415 Name of the chained collection. Must have already been added via
416 a call to `Registry.registerCollection`.
418 Returns
419 -------
420 children : `CollectionSearch`
421 An object that defines the search path of the collection.
422 See :ref:`daf_butler_collection_expressions` for more information.
424 Raises
425 ------
426 MissingCollectionError
427 Raised if ``parent`` does not exist in the `Registry`.
428 TypeError
429 Raised if ``parent`` does not correspond to a
430 `~CollectionType.CHAINED` collection.
431 """
432 record = self._collections.find(parent)
433 if record.type is not CollectionType.CHAINED:
434 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
435 return record.children
437 @transactional
438 def setCollectionChain(self, parent: str, children: Any):
439 """Define or redefine a `~CollectionType.CHAINED` collection.
441 Parameters
442 ----------
443 parent : `str`
444 Name of the chained collection. Must have already been added via
445 a call to `Registry.registerCollection`.
446 children : `Any`
447 An expression defining an ordered search of child collections,
448 generally an iterable of `str`. Restrictions on the dataset types
449 to be searched can also be included, by passing mapping or an
450 iterable containing tuples; see
451 :ref:`daf_butler_collection_expressions` for more information.
453 Raises
454 ------
455 MissingCollectionError
456 Raised when any of the given collections do not exist in the
457 `Registry`.
458 TypeError
459 Raised if ``parent`` does not correspond to a
460 `~CollectionType.CHAINED` collection.
461 ValueError
462 Raised if the given collections contain a cycle.
463 """
464 record = self._collections.find(parent)
465 if record.type is not CollectionType.CHAINED:
466 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
467 children = CollectionSearch.fromExpression(children)
468 if children != record.children:
469 record.update(self._collections, children)
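# Collection-management sketch (collection names are hypothetical):
#
#     registry.registerRun("HSC/runs/example")
#     registry.registerCollection("HSC/tagged", CollectionType.TAGGED)
#     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/example", "HSC/tagged"])
#     children = registry.getCollectionChain("HSC/defaults")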
471 def registerDatasetType(self, datasetType: DatasetType) -> bool:
472 """
473 Add a new `DatasetType` to the Registry.
475 It is not an error to register the same `DatasetType` twice.
477 Parameters
478 ----------
479 datasetType : `DatasetType`
480 The `DatasetType` to be added.
482 Returns
483 -------
484 inserted : `bool`
485 `True` if ``datasetType`` was inserted, `False` if an identical
486 existing `DatasetType` was found. Note that in either case the
487 DatasetType is guaranteed to be defined in the Registry
488 consistently with the given definition.
490 Raises
491 ------
492 ValueError
493 Raised if the dimensions or storage class are invalid.
494 ConflictingDefinitionError
495 Raised if this DatasetType is already registered with a different
496 definition.
498 Notes
499 -----
500 This method cannot be called within transactions, as it needs to be
501 able to perform its own transaction to be concurrent.
502 """
503 _, inserted = self._datasets.register(datasetType)
504 return inserted
506 def getDatasetType(self, name: str) -> DatasetType:
507 """Get the `DatasetType`.
509 Parameters
510 ----------
511 name : `str`
512 Name of the type.
514 Returns
515 -------
516 type : `DatasetType`
517 The `DatasetType` associated with the given name.
519 Raises
520 ------
521 KeyError
522 Raised if the requested DatasetType could not be found in the registry.
523 """
524 storage = self._datasets.find(name)
525 if storage is None:
526 raise KeyError(f"DatasetType '{name}' could not be found.")
527 return storage.datasetType
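# Dataset-type registration sketch (the name, dimensions, and storage class
# are illustrative):
#
#     from lsst.daf.butler import DatasetType
#
#     calexpType = DatasetType("calexp",
#                              dimensions=["instrument", "visit", "detector"],
#                              storageClass="ExposureF",
#                              universe=registry.dimensions)
#     registry.registerDatasetType(calexpType)
#     assert registry.getDatasetType("calexp") == calexpType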
529 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
530 collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
531 """Find a dataset given its `DatasetType` and data ID.
533 This can be used to obtain a `DatasetRef` that permits the dataset to
534 be read from a `Datastore`. If the dataset is a component and can not
535 be found using the provided dataset type, a dataset ref for the parent
536 will be returned instead but with the correct dataset type.
538 Parameters
539 ----------
540 datasetType : `DatasetType` or `str`
541 A `DatasetType` or the name of one.
542 dataId : `dict` or `DataCoordinate`, optional
543 A `dict`-like object containing the `Dimension` links that identify
544 the dataset within a collection.
545 collections
546 An expression that fully or partially identifies the collections
547 to search for the dataset, such as a `str`, `re.Pattern`, or
548 iterable thereof. `...` can be used to return all collections.
549 See :ref:`daf_butler_collection_expressions` for more information.
550 **kwargs
551 Additional keyword arguments passed to
552 `DataCoordinate.standardize` to convert ``dataId`` to a true
553 `DataCoordinate` or augment an existing one.
555 Returns
556 -------
557 ref : `DatasetRef`
558 A reference to the dataset, or `None` if no matching Dataset
559 was found.
561 Raises
562 ------
563 LookupError
564 Raised if one or more data ID keys are missing or the dataset type
565 does not exist.
566 MissingCollectionError
567 Raised if any of ``collections`` does not exist in the registry.
568 """
569 if isinstance(datasetType, DatasetType):
570 storage = self._datasets.find(datasetType.name)
571 if storage is None:
572 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
573 else:
574 storage = self._datasets.find(datasetType)
575 if storage is None:
576 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
577 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
578 universe=self.dimensions, **kwargs)
579 collections = CollectionSearch.fromExpression(collections)
580 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
581 result = storage.find(collectionRecord, dataId)
582 if result is not None:
583 if result.datasetType.isComposite():
584 result = self._datasets.fetchComponents(result)
585 return result
587 # fallback to the parent if we got nothing and this was a component
588 if storage.datasetType.isComponent():
589 parentType, _ = storage.datasetType.nameAndComponent()
590 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs)
591 if parentRef is not None:
592 # Should already conform and we know no components
593 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id,
594 run=parentRef.run, conform=False, hasParentId=True)
596 return None
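# findDataset sketch (data ID values and collection name are hypothetical):
#
#     ref = registry.findDataset("calexp",
#                                instrument="HSC", visit=903334, detector=20,
#                                collections="HSC/runs/example")
#     if ref is not None:
#         print(ref.id, ref.run)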
598 @transactional
599 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
600 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
601 ) -> List[DatasetRef]:
602 """Insert one or more datasets into the `Registry`
604 This always adds new datasets; to associate existing datasets with
605 a new collection, use ``associate``.
607 Parameters
608 ----------
609 datasetType : `DatasetType` or `str`
610 A `DatasetType` or the name of one.
611 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
612 Dimension-based identifiers for the new datasets.
613 run : `str`
614 The name of the run that produced the datasets.
615 producer : `Quantum`
616 Unit of work that produced the datasets. May be `None` to store
617 no provenance information, but if present the `Quantum` must
618 already have been added to the Registry.
619 recursive : `bool`
620 If True, recursively add datasets and attach entries for component
621 datasets as well.
623 Returns
624 -------
625 refs : `list` of `DatasetRef`
626 Resolved `DatasetRef` instances for all given data IDs (in the same
627 order).
629 Raises
630 ------
631 ConflictingDefinitionError
632 If a dataset with the same dataset type and data ID as one of those
633 given already exists in ``run``.
634 MissingCollectionError
635 Raised if ``run`` does not exist in the registry.
636 """
637 if isinstance(datasetType, DatasetType):
638 storage = self._datasets.find(datasetType.name)
639 if storage is None:
640 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
641 else:
642 storage = self._datasets.find(datasetType)
643 if storage is None:
644 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
645 runRecord = self._collections.find(run)
646 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds]
647 try:
648 refs = list(storage.insert(runRecord, dataIds, quantum=producer))
649 except sqlalchemy.exc.IntegrityError as err:
650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
651 f"one or more datasets of type {storage.datasetType} into "
652 f"collection '{run}'. "
653 f"This probably means a dataset with the same data ID "
654 f"and dataset type already exists, but it may also mean a "
655 f"dimension row is missing.") from err
656 if recursive and storage.datasetType.isComposite():
657 # Insert component rows by recursing.
658 composites = defaultdict(dict)
659 # TODO: we really shouldn't be inserting all components defined by
660 # the storage class, because there's no guarantee all of them are
661 # actually present in these datasets.
662 for componentName in storage.datasetType.storageClass.components:
663 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName)
664 componentRefs = self.insertDatasets(componentDatasetType,
665 dataIds=dataIds,
666 run=run,
667 producer=producer,
668 recursive=True)
669 for parentRef, componentRef in zip(refs, componentRefs):
670 composites[parentRef][componentName] = componentRef
671 if composites:
672 refs = list(self._datasets.attachComponents(composites.items()))
673 return refs
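# insertDatasets sketch (run name and data ID values are illustrative):
#
#     refs = registry.insertDatasets("calexp",
#                                    dataIds=[{"instrument": "HSC",
#                                              "visit": 903334,
#                                              "detector": 20}],
#                                    run="HSC/runs/example")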
675 def getDataset(self, id: int) -> Optional[DatasetRef]:
676 """Retrieve a Dataset entry.
678 Parameters
679 ----------
680 id : `int`
681 The unique identifier for the dataset.
683 Returns
684 -------
685 ref : `DatasetRef` or `None`
686 A ref to the Dataset, or `None` if no matching Dataset
687 was found.
688 """
689 ref = self._datasets.getDatasetRef(id)
690 if ref is None:
691 return None
692 if ref.datasetType.isComposite():
693 return self._datasets.fetchComponents(ref)
694 return ref
696 @transactional
697 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True):
698 """Remove datasets from the Registry.
700 The datasets will be removed unconditionally from all collections, and
701 any `Quantum` that consumed this dataset will instead be marked with
702 having a NULL input. `Datastore` records will *not* be deleted; the
703 caller is responsible for ensuring that the dataset has already been
704 removed from all Datastores.
706 Parameters
707 ----------
708 refs : `Iterable` of `DatasetRef`
709 References to the datasets to be removed. Must include a valid
710 ``id`` attribute, and should be considered invalidated upon return.
711 recursive : `bool`, optional
712 If `True`, remove all component datasets as well. Note that
713 this only removes components that are actually included in the
714 given `DatasetRef` instances, which may not be the same as those in
715 the database (especially if they were obtained from
716 `queryDatasets`, which does not populate `DatasetRef.components`).
718 Raises
719 ------
720 AmbiguousDatasetError
721 Raised if any ``ref.id`` is `None`.
722 OrphanedRecordError
723 Raised if any dataset is still present in any `Datastore`.
724 """
725 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
726 storage = self._datasets.find(datasetType.name)
727 try:
728 storage.delete(refsForType)
729 except sqlalchemy.exc.IntegrityError as err:
730 raise OrphanedRecordError("One or more datasets is still "
731 "present in one or more Datastores.") from err
733 @transactional
734 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]):
735 """Attach components to a dataset.
737 Parameters
738 ----------
739 parent : `DatasetRef`
740 A reference to the parent dataset.
741 components : `Mapping` [ `str`, `DatasetRef` ]
742 Mapping from component name to the `DatasetRef` for that component.
744 Returns
745 -------
746 ref : `DatasetRef`
747 An updated version of ``parent`` with components included.
749 Raises
750 ------
751 AmbiguousDatasetError
752 Raised if ``parent.id`` or any `DatasetRef.id` in ``components``
753 is `None`.
754 """
755 for name, ref in components.items():
756 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]:
757 raise TypeError(f"Expected storage class "
758 f"'{parent.datasetType.storageClass.components[name].name}' "
759 f"for component '{name}' of dataset {parent}; got "
760 f"dataset {ref} with storage class "
761 f"'{ref.datasetType.storageClass.name}'.")
762 ref, = self._datasets.attachComponents([(parent, components)])
763 return ref
765 @transactional
766 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
767 """Add existing datasets to a `~CollectionType.TAGGED` collection.
769 If a DatasetRef with the same exact integer ID is already in a
770 collection nothing is changed. If a `DatasetRef` with the same
771 `DatasetType` and data ID but with different integer ID
772 exists in the collection, `ConflictingDefinitionError` is raised.
774 Parameters
775 ----------
776 collection : `str`
777 Indicates the collection the datasets should be associated with.
778 refs : `Iterable` [ `DatasetRef` ]
779 An iterable of resolved `DatasetRef` instances that already exist
780 in this `Registry`.
781 recursive : `bool`, optional
782 If `True`, associate all component datasets as well. Note that
783 this only associates components that are actually included in the
784 given `DatasetRef` instances, which may not be the same as those in
785 the database (especially if they were obtained from
786 `queryDatasets`, which does not populate `DatasetRef.components`).
788 Raises
789 ------
790 ConflictingDefinitionError
791 If a Dataset with the given `DatasetRef` already exists in the
792 given collection.
793 AmbiguousDatasetError
794 Raised if ``any(ref.id is None for ref in refs)``.
795 MissingCollectionError
796 Raised if ``collection`` does not exist in the registry.
797 TypeError
798 Raised if adding new datasets to the given ``collection`` is not
799 allowed.
800 """
801 collectionRecord = self._collections.find(collection)
802 if collectionRecord.type is not CollectionType.TAGGED:
803 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
804 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
805 storage = self._datasets.find(datasetType.name)
806 try:
807 storage.associate(collectionRecord, refsForType)
808 except sqlalchemy.exc.IntegrityError as err:
809 raise ConflictingDefinitionError(
810 f"Constraint violation while associating dataset of type {datasetType.name} with "
811 f"collection {collection}. This probably means that one or more datasets with the same "
812 f"dataset type and data ID already exist in the collection, but it may also indicate "
813 f"that the datasets do not exist."
814 ) from err
816 @transactional
817 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
818 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
820 ``collection`` and ``ref`` combinations that are not currently
821 associated are silently ignored.
823 Parameters
824 ----------
825 collection : `str`
826 The collection the datasets should no longer be associated with.
827 refs : `Iterable` [ `DatasetRef` ]
828 An iterable of resolved `DatasetRef` instances that already exist
829 in this `Registry`.
830 recursive : `bool`, optional
831 If `True`, disassociate all component datasets as well. Note that
832 this only disassociates components that are actually included in
833 the given `DatasetRef` instances, which may not be the same as
834 those in the database (especially if they were obtained from
835 `queryDatasets`, which does not populate `DatasetRef.components`).
837 Raises
838 ------
839 AmbiguousDatasetError
840 Raised if any of the given dataset references is unresolved.
841 MissingCollectionError
842 Raised if ``collection`` does not exist in the registry.
843 TypeError
844 Raised if removing datasets from the given ``collection`` is not
845 allowed.
846 """
847 collectionRecord = self._collections.find(collection)
848 if collectionRecord.type is not CollectionType.TAGGED:
849 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
850 "expected TAGGED.")
851 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items():
852 storage = self._datasets.find(datasetType.name)
853 storage.disassociate(collectionRecord, refsForType)
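# Tagging sketch: add previously-inserted datasets to a TAGGED collection and
# later remove them again (names reuse the hypothetical ones above):
#
#     registry.registerCollection("HSC/tagged", CollectionType.TAGGED)
#     registry.associate("HSC/tagged", refs)
#     registry.disassociate("HSC/tagged", refs)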
855 @transactional
856 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
857 """Record that a datastore holds the given datasets.
859 Typically used by `Datastore`.
861 Parameters
862 ----------
863 datastoreName : `str`
864 Name of the datastore holding these datasets.
865 refs : `~collections.abc.Iterable` of `DatasetRef`
866 References to the datasets.
868 Raises
869 ------
870 AmbiguousDatasetError
871 Raised if ``any(ref.id is None for ref in refs)``.
872 """
873 self._db.insert(
874 self._tables.dataset_location,
875 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
876 )
878 @transactional
879 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
880 """Move the dataset location information to trash.
882 Parameters
883 ----------
884 datastoreName : `str`
885 Name of the datastore holding these datasets.
886 refs : `~collections.abc.Iterable` of `DatasetRef`
887 References to the datasets.
888 """
889 # We only want to move rows that already exist in the main table
890 filtered = self.checkDatasetLocations(datastoreName, refs)
891 self.canDeleteDatasetLocations(datastoreName, filtered)
892 self.removeDatasetLocation(datastoreName, filtered)
894 @transactional
895 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
896 """Record that a datastore can delete this dataset
898 Parameters
899 ----------
900 datastoreName : `str`
901 Name of the datastore holding these datasets.
902 refs : `~collections.abc.Iterable` of `DatasetRef`
903 References to the datasets.
905 Raises
906 ------
907 AmbiguousDatasetError
908 Raised if ``any(ref.id is None for ref in refs)``.
909 """
910 self._db.insert(
911 self._tables.dataset_location_trash,
912 *[{"datastore_name": datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
913 )
915 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
916 """Check which refs are listed for this datastore.
918 Parameters
919 ----------
920 datastoreName : `str`
921 Name of the datastore holding these datasets.
922 refs : `~collections.abc.Iterable` of `DatasetRef`
923 References to the datasets.
925 Returns
926 -------
927 present : `list` of `DatasetRef`
928 All the `DatasetRef` that are listed.
929 """
931 table = self._tables.dataset_location
932 result = self._db.query(
933 sqlalchemy.sql.select(
934 [table.columns.datastore_name, table.columns.dataset_id]
935 ).where(
936 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
937 table.columns.datastore_name == datastoreName)
938 )
939 ).fetchall()
941 matched_ids = {r["dataset_id"] for r in result}
942 return [ref for ref in refs if ref.id in matched_ids]
944 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
945 """Retrieve datastore locations for a given dataset.
947 Typically used by `Datastore`.
949 Parameters
950 ----------
951 ref : `DatasetRef`
952 A reference to the dataset for which to retrieve storage
953 information.
955 Returns
956 -------
957 datastores : `set` of `str`
958 All the matching datastores holding this dataset. Empty set
959 if the dataset does not exist anywhere.
961 Raises
962 ------
963 AmbiguousDatasetError
964 Raised if ``ref.id`` is `None`.
965 """
966 table = self._tables.dataset_location
967 result = self._db.query(
968 sqlalchemy.sql.select(
969 [table.columns.datastore_name]
970 ).where(
971 table.columns.dataset_id == ref.id
972 )
973 ).fetchall()
974 return {r["datastore_name"] for r in result}
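# Datastore-location bookkeeping sketch (the datastore name is hypothetical;
# these methods are normally driven by a `Datastore`, not by end users):
#
#     registry.insertDatasetLocations("FileDatastore@example", refs)
#     assert "FileDatastore@example" in registry.getDatasetLocations(refs[0])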
976 @transactional
977 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
978 """Retrieve all the dataset ref IDs that are in the trash
979 associated with the specified datastore.
981 Parameters
982 ----------
983 datastoreName : `str`
984 The relevant datastore name to use.
986 Returns
987 -------
988 ids : `set` of `FakeDatasetRef`
989 The IDs of datasets that can be safely removed from this datastore.
990 Can be empty.
991 """
992 table = self._tables.dataset_location_trash
993 result = self._db.query(
994 sqlalchemy.sql.select(
995 [table.columns.dataset_id]
996 ).where(
997 table.columns.datastore_name == datastoreName
998 )
999 ).fetchall()
1000 return {FakeDatasetRef(r["dataset_id"]) for r in result}
1002 @transactional
1003 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
1004 """Remove datastore location associated with these datasets from trash.
1006 Typically used by `Datastore` when a dataset is removed.
1008 Parameters
1009 ----------
1010 datastoreName : `str`
1011 Name of this `Datastore`.
1012 refs : iterable of `FakeDatasetRef`
1013 The dataset IDs to be removed.
1015 Raises
1016 ------
1017 AmbiguousDatasetError
1018 Raised if ``ref.id`` is `None`.
1019 """
1020 if not refs:
1021 return
1022 self._db.delete(
1023 self._tables.dataset_location_trash,
1024 ["dataset_id", "datastore_name"],
1025 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
1026 )
1028 @transactional
1029 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
1030 """Remove datastore location associated with this dataset.
1032 Typically used by `Datastore` when a dataset is removed.
1034 Parameters
1035 ----------
1036 datastoreName : `str`
1037 Name of this `Datastore`.
1038 refs : iterable of `DatasetRef`
1039 References to the datasets for which information is to be removed.
1041 Raises
1042 ------
1043 AmbiguousDatasetError
1044 Raised if ``ref.id`` is `None`.
1045 """
1046 if not refs:
1047 return
1048 self._db.delete(
1049 self._tables.dataset_location,
1050 ["dataset_id", "datastore_name"],
1051 *[{"dataset_id": ref.getCheckedId(), "datastore_name": datastoreName} for ref in refs]
1052 )
1054 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
1055 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
1056 """Expand a dimension-based data ID to include additional information.
1058 Parameters
1059 ----------
1060 dataId : `DataCoordinate` or `dict`, optional
1061 Data ID to be expanded; augmented and overridden by ``kwds``.
1062 graph : `DimensionGraph`, optional
1063 Set of dimensions for the expanded ID. If `None`, the dimensions
1064 will be inferred from the keys of ``dataId`` and ``kwds``.
1065 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
1066 are silently ignored, providing a way to extract and expand a
1067 subset of a data ID.
1068 records : mapping [`DimensionElement`, `DimensionRecord`], optional
1069 Dimension record data to use before querying the database for that
1070 data.
1071 **kwds
1072 Additional keywords are treated like additional key-value pairs for
1073 ``dataId``, extending and overriding it.
1075 Returns
1076 -------
1077 expanded : `ExpandedDataCoordinate`
1078 A data ID that includes full metadata for all of the dimensions it
1079 identifies.
1080 """
1081 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
1082 if isinstance(standardized, ExpandedDataCoordinate):
1083 return standardized
1084 elif isinstance(dataId, ExpandedDataCoordinate):
1085 records = dict(records) if records is not None else {}
1086 records.update(dataId.records)
1087 else:
1088 records = dict(records) if records is not None else {}
1089 keys = dict(standardized)
1090 regions = []
1091 timespans = []
1092 for element in standardized.graph.primaryKeyTraversalOrder:
1093 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
1094 if record is ...:
1095 storage = self._dimensions[element]
1096 record = storage.fetch(keys)
1097 records[element] = record
1098 if record is not None:
1099 for d in element.implied:
1100 value = getattr(record, d.name)
1101 if keys.setdefault(d, value) != value:
1102 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
1103 f"but {element.name} implies {d.name}={value!r}.")
1104 if element in standardized.graph.spatial and record.region is not None:
1105 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
1106 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
1107 f"is disjoint with those for other elements.")
1108 regions.append(record.region)
1109 if element in standardized.graph.temporal:
1110 if any(not record.timespan.overlaps(t) for t in timespans):
1111 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
1112 f" is disjoint with those for other elements.")
1113 timespans.append(record.timespan)
1114 else:
1115 if element in standardized.graph.required:
1116 raise LookupError(
1117 f"Could not fetch record for required dimension {element.name} via keys {keys}."
1118 )
1119 if element.alwaysJoin:
1120 raise InconsistentDataIdError(
1121 f"Could not fetch record for element {element.name} via keys {keys}, ",
1122 f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
1123 f"related."
1124 )
1125 records.update((d, None) for d in element.implied)
1126 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)
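# expandDataId sketch (dimension values are hypothetical):
#
#     expanded = registry.expandDataId({"instrument": "HSC", "visit": 903334})
#     # ``expanded`` is an ExpandedDataCoordinate carrying the dimension
#     # records (and hence any region/timespan metadata) for the identified
#     # dimensions, so no further database lookups are needed downstream.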
1128 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
1129 """Compare the keys and values of a pair of data IDs for consistency.
1131 See `ConsistentDataIds` for more information.
1133 Parameters
1134 ----------
1135 a : `dict` or `DataCoordinate`
1136 First data ID to be compared.
1137 b : `dict` or `DataCoordinate`
1138 Second data ID to be compared.
1140 Returns
1141 -------
1142 relationship : `ConsistentDataIds` or `None`
1143 Relationship information. This is not `None` and coerces to
1144 `True` in boolean contexts if and only if the data IDs are
1145 consistent in terms of all common key-value pairs, all many-to-many
1146 join tables, and all spatial and temporal relationships.
1147 """
1148 a = DataCoordinate.standardize(a, universe=self.dimensions)
1149 b = DataCoordinate.standardize(b, universe=self.dimensions)
1150 aFull = getattr(a, "full", None)
1151 bFull = getattr(b, "full", None)
1152 aBest = aFull if aFull is not None else a
1153 bBest = bFull if bFull is not None else b
1154 jointKeys = aBest.keys() & bBest.keys()
1155 # If any common values are not equal, we know they are inconsistent.
1156 if any(aBest[k] != bBest[k] for k in jointKeys):
1157 return None
1158 # If the graphs are equal, we know the data IDs are.
1159 if a.graph == b.graph:
1160 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
1161 # Result is still inconclusive. Try to expand a data ID containing
1162 # keys from both; that will fail if they are inconsistent.
1163 # First, if either input was already an ExpandedDataCoordinate, extract
1164 # its records so we don't have to query for them.
1165 records = {}
1166 if hasattr(a, "records"):
1167 records.update(a.records)
1168 if hasattr(b, "records"):
1169 records.update(b.records)
1170 try:
1171 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
1172 except InconsistentDataIdError:
1173 return None
1174 # We know the answer is not `None`; time to figure out what it is.
1175 return ConsistentDataIds(
1176 contains=(a.graph >= b.graph),
1177 within=(a.graph <= b.graph),
1178 overlaps=bool(a.graph & b.graph),
1179 )
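# relateDataIds sketch (data ID values are hypothetical):
#
#     rel = registry.relateDataIds({"instrument": "HSC", "visit": 903334},
#                                  {"instrument": "HSC"})
#     if rel:  # truthy iff the data IDs are consistent
#         print(rel.contains, rel.within, rel.overlaps, rel.disjoint)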
1181 def insertDimensionData(self, element: Union[DimensionElement, str],
1182 *data: Union[dict, DimensionRecord],
1183 conform: bool = True):
1184 """Insert one or more dimension records into the database.
1186 Parameters
1187 ----------
1188 element : `DimensionElement` or `str`
1189 The `DimensionElement` or name thereof that identifies the table
1190 records will be inserted into.
1191 data : `dict` or `DimensionRecord` (variadic)
1192 One or more records to insert.
1193 conform : `bool`, optional
1194 If `False` (`True` is default) perform no checking or conversions,
1195 and assume that ``element`` is a `DimensionElement` instance and
1196 ``data`` is one or more `DimensionRecord` instances of the
1197 appropriate subclass.
1198 """
1199 if conform:
1200 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1201 records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1202 for row in data]
1203 else:
1204 records = data
1205 storage = self._dimensions[element]
1206 storage.insert(*records)
1208 def syncDimensionData(self, element: Union[DimensionElement, str],
1209 row: Union[dict, DimensionRecord],
1210 conform: bool = True) -> bool:
1211 """Synchronize the given dimension record with the database, inserting
1212 if it does not already exist and comparing values if it does.
1214 Parameters
1215 ----------
1216 element : `DimensionElement` or `str`
1217 The `DimensionElement` or name thereof that identifies the table
1218 records will be inserted into.
1219 row : `dict` or `DimensionRecord`
1220 The record to insert.
1221 conform : `bool`, optional
1222 If `False` (`True` is default) perform no checking or conversions,
1223 and assume that ``element`` is a `DimensionElement` instance and
1224 ``row`` is a `DimensionRecord` instance of the appropriate
1225 subclass.
1227 Returns
1228 -------
1229 inserted : `bool`
1230 `True` if a new row was inserted, `False` otherwise.
1232 Raises
1233 ------
1234 ConflictingDefinitionError
1235 Raised if the record exists in the database (according to primary
1236 key lookup) but is inconsistent with the given one.
1238 Notes
1239 -----
1240 This method cannot be called within transactions, as it needs to be
1241 able to perform its own transaction to be concurrent.
1242 """
1243 if conform:
1244 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement.
1245 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
1246 else:
1247 record = row
1248 storage = self._dimensions[element]
1249 return storage.sync(record)
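# Dimension-record sketch (field values are illustrative; the exact set of
# required fields is defined by the dimension configuration, not here):
#
#     record = {"name": "DummyCam", "visit_max": 1024,
#               "exposure_max": 1024, "detector_max": 4}
#     registry.insertDimensionData("instrument", record)
#     registry.syncDimensionData("instrument", record)  # False: already present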
1251 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
1252 """Iterate over the dataset types whose names match an expression.
1254 Parameters
1255 ----------
1256 expression : `Any`, optional
1257 An expression that fully or partially identifies the dataset types
1258 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1259 `...` can be used to return all dataset types, and is the default.
1260 See :ref:`daf_butler_dataset_type_expressions` for more
1261 information.
1263 Yields
1264 ------
1265 datasetType : `DatasetType`
1266 A `DatasetType` instance whose name matches ``expression``.
1267 """
1268 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1269 if wildcard is ...:
1270 yield from self._datasets
1271 return
1272 done = set()
1273 for name in wildcard.strings:
1274 storage = self._datasets.find(name)
1275 if storage is not None:
1276 done.add(storage.datasetType)
1277 yield storage.datasetType
1278 if wildcard.patterns:
1279 for datasetType in self._datasets:
1280 if datasetType.name in done:
1281 continue
1282 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1283 yield datasetType
1285 def queryCollections(self, expression: Any = ...,
1286 datasetType: Optional[DatasetType] = None,
1287 collectionType: Optional[CollectionType] = None,
1288 flattenChains: bool = False,
1289 includeChains: Optional[bool] = None) -> Iterator[str]:
1290 """Iterate over the collections whose names match an expression.
1292 Parameters
1293 ----------
1294 expression : `Any`, optional
1295 An expression that fully or partially identifies the collections
1296 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1297 `...` can be used to return all collections, and is the default.
1298 See :ref:`daf_butler_collection_expressions` for more
1299 information.
1300 datasetType : `DatasetType`, optional
1301 If provided, only yield collections that should be searched for
1302 this dataset type according to ``expression``. If this is
1303 not provided, any dataset type restrictions in ``expression`` are
1304 ignored.
1305 collectionType : `CollectionType`, optional
1306 If provided, only yield collections of this type.
1307 flattenChains : `bool`, optional
1308 If `True` (`False` is default), recursively yield the child
1309 collections of matching `~CollectionType.CHAINED` collections.
1310 includeChains : `bool`, optional
1311 If `True`, yield records for matching `~CollectionType.CHAINED`
1312 collections. Default is the opposite of ``flattenChains``: include
1313 either CHAINED collections or their children, but not both.
1315 Yields
1316 ------
1317 collection : `str`
1318 The name of a collection that matches ``expression``.
1319 """
1320 query = CollectionQuery.fromExpression(expression)
1321 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
1322 flattenChains=flattenChains, includeChains=includeChains):
1323 yield record.name
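# Query-expression sketch (the pattern and filters are illustrative):
#
#     import re
#
#     coaddTypes = list(registry.queryDatasetTypes(re.compile(r"deepCoadd.*")))
#     runs = list(registry.queryCollections(..., collectionType=CollectionType.RUN))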
1325 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
1326 """Return a `QueryBuilder` instance capable of constructing and
1327 managing more complex queries than those obtainable via `Registry`
1328 interfaces.
1330 This is an advanced interface; downstream code should prefer
1331 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
1332 are sufficient.
1334 Parameters
1335 ----------
1336 summary : `QuerySummary`
1337 Object describing and categorizing the full set of dimensions that
1338 will be included in the query.
1340 Returns
1341 -------
1342 builder : `QueryBuilder`
1343 Object that can be used to construct and perform advanced queries.
1344 """
1345 return QueryBuilder(summary=summary,
1346 collections=self._collections,
1347 dimensions=self._dimensions,
1348 datasets=self._datasets)
1350 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1351 dataId: Optional[DataId] = None,
1352 datasets: Any = None,
1353 collections: Any = None,
1354 where: Optional[str] = None,
1355 expand: bool = True,
1356 **kwds) -> Iterator[DataCoordinate]:
1357 """Query for and iterate over data IDs matching user-provided criteria.
1359 Parameters
1360 ----------
1361 dimensions : `Dimension` or `str`, or iterable thereof
1362 The dimensions of the data IDs to yield, as either `Dimension`
1363 instances or `str`. Will be automatically expanded to a complete
1364 `DimensionGraph`.
1365 dataId : `dict` or `DataCoordinate`, optional
1366 A data ID whose key-value pairs are used as equality constraints
1367 in the query.
1368 datasets : `Any`, optional
1369 An expression that fully or partially identifies dataset types
1370 that should constrain the yielded data IDs. For example, including
1371 "raw" here would constrain the yielded ``instrument``,
1372 ``exposure``, ``detector``, and ``physical_filter`` values to only
1373 those for which at least one "raw" dataset exists in
1374 ``collections``. Allowed types include `DatasetType`, `str`,
1375 `re.Pattern`, and iterables thereof. Unlike other dataset type
1376 expressions, `...` is not permitted - it doesn't make sense to
1377 constrain data IDs on the existence of *all* datasets.
1378 See :ref:`daf_butler_dataset_type_expressions` for more
1379 information.
1380 collections : `Any`, optional
1381 An expression that fully or partially identifies the collections
1382 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1383 thereof. `...` can be used to return all collections. Must be
1384 provided if ``datasets`` is, and is ignored if it is not. See
1385 :ref:`daf_butler_collection_expressions` for more information.
1386 where : `str`, optional
1387 A string expression similar to a SQL WHERE clause. May involve
1388 any column of a dimension table or (as a shortcut for the primary
1389 key column of a dimension table) dimension name. See
1390 :ref:`daf_butler_dimension_expressions` for more information.
1391 expand : `bool`, optional
1392 If `True` (default) yield `ExpandedDataCoordinate` instead of
1393 minimal `DataCoordinate` base-class instances.
1394 kwds
1395 Additional keyword arguments are forwarded to
1396 `DataCoordinate.standardize` when processing the ``dataId``
1397 argument (and may be used to provide a constraining data ID even
1398 when the ``dataId`` argument is `None`).
1400 Yields
1401 ------
1402 dataId : `DataCoordinate`
1403 Data IDs matching the given query parameters. Order is
1404 unspecified.
1405 """
1406 dimensions = iterable(dimensions)
1407 standardizedDataId = self.expandDataId(dataId, **kwds)
1408 standardizedDatasetTypes = []
1409 requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
1410 if datasets is not None:
1411 if collections is None:
1412 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1413 for datasetType in self.queryDatasetTypes(datasets):
1414 requestedDimensionNames.update(datasetType.dimensions.names)
1415 standardizedDatasetTypes.append(datasetType)
1416 # Preprocess collections expression in case the original included
1417 # single-pass iterators (we'll want to use it multiple times
1418 # below).
1419 collections = CollectionQuery.fromExpression(collections)
1421 summary = QuerySummary(
1422 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1423 dataId=standardizedDataId,
1424 expression=where,
1425 )
1426 builder = self.makeQueryBuilder(summary)
1427 for datasetType in standardizedDatasetTypes:
1428 builder.joinDataset(datasetType, collections, isResult=False)
1429 query = builder.finish()
1430 predicate = query.predicate()
1431 for row in self._db.query(query.sql):
1432 if predicate(row):
1433 result = query.extractDataId(row)
1434 if expand:
1435 yield self.expandDataId(result, records=standardizedDataId.records)
1436 else:
1437 yield result
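# queryDimensions sketch (dataset type, collection, and where clause are
# hypothetical):
#
#     dataIds = registry.queryDimensions(
#         ["exposure", "detector"],
#         datasets="raw",
#         collections="HSC/raw/all",
#         where="exposure.observation_type = 'science'",
#     )
#     for dataId in dataIds:
#         print(dataId)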
1439 def queryDatasets(self, datasetType: Any, *,
1440 collections: Any,
1441 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1442 dataId: Optional[DataId] = None,
1443 where: Optional[str] = None,
1444 deduplicate: bool = False,
1445 expand: bool = True,
1446 **kwds) -> Iterator[DatasetRef]:
1447 """Query for and iterate over dataset references matching user-provided
1448 criteria.
1450 Parameters
1451 ----------
1452 datasetType
1453 An expression that fully or partially identifies the dataset types
1454 to be queried. Allowed types include `DatasetType`, `str`,
1455 `re.Pattern`, and iterables thereof. The special value `...` can
1456 be used to query all dataset types. See
1457 :ref:`daf_butler_dataset_type_expressions` for more information.
1458 collections
1459 An expression that fully or partially identifies the collections
1460 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1461 thereof. `...` can be used to return all collections. See
1462 :ref:`daf_butler_collection_expressions` for more information.
1463 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1464 Dimensions to include in the query (in addition to those used
1465 to identify the queried dataset type(s)), either to constrain
1466 the resulting datasets to those for which a matching dimension
1467 exists, or to relate the dataset type's dimensions to dimensions
1468 referenced by the ``dataId`` or ``where`` arguments.
1469 dataId : `dict` or `DataCoordinate`, optional
1470 A data ID whose key-value pairs are used as equality constraints
1471 in the query.
1472 where : `str`, optional
1473 A string expression similar to a SQL WHERE clause. May involve
1474 any column of a dimension table or (as a shortcut for the primary
1475 key column of a dimension table) dimension name. See
1476 :ref:`daf_butler_dimension_expressions` for more information.
1477 deduplicate : `bool`, optional
1478 If `True` (`False` is default), for each result data ID, only
1479 yield one `DatasetRef` of each `DatasetType`, from the first
1480 collection in which a dataset of that dataset type appears
1481 (according to the order of ``collections`` passed in). If `True`,
1482 ``collections`` must not contain regular expressions and may not
1483 be `...`.
1484 expand : `bool`, optional
1485 If `True` (default) attach `ExpandedDataCoordinate` instead of
1486 minimal `DataCoordinate` base-class instances.
1487 kwds
1488 Additional keyword arguments are forwarded to
1489 `DataCoordinate.standardize` when processing the ``dataId``
1490 argument (and may be used to provide a constraining data ID even
1491 when the ``dataId`` argument is `None`).
1493 Yields
1494 ------
1495 ref : `DatasetRef`
1496 Dataset references matching the given query criteria. These
1497 are grouped by `DatasetType` if the query evaluates to multiple
1498 dataset types, but order is otherwise unspecified.
1500 Raises
1501 ------
1502 TypeError
1503 Raised when the arguments are incompatible, such as when a
1504 collection wildcard is passed when ``deduplicate`` is `True`.
1506 Notes
1507 -----
1508 When multiple dataset types are queried in a single call, the
1509 results of this operation are equivalent to querying for each dataset
1510 type separately in turn, and no information about the relationships
1511 between datasets of different types is included. In contexts where
1512 that kind of information is important, the recommended pattern is to
1513 use `queryDimensions` to first obtain data IDs (possibly with the
1514 desired dataset types and collections passed as constraints to the
1515 query), and then use multiple (generally much simpler) calls to
1516 `queryDatasets` with the returned data IDs passed as constraints.
1517 """
1518 # Standardize the collections expression.
1519 if deduplicate:
1520 collections = CollectionSearch.fromExpression(collections)
1521 else:
1522 collections = CollectionQuery.fromExpression(collections)
1523 # Standardize and expand the data ID provided as a constraint.
1524 standardizedDataId = self.expandDataId(dataId, **kwds)
1525 # If the datasetType passed isn't actually a DatasetType, expand it
1526 # (it could be an expression that yields multiple DatasetTypes) and
1527 # recurse.
1528 if not isinstance(datasetType, DatasetType):
1529 for trueDatasetType in self.queryDatasetTypes(datasetType):
1530 yield from self.queryDatasets(trueDatasetType, collections=collections,
1531 dimensions=dimensions, dataId=standardizedDataId,
1532 where=where, deduplicate=deduplicate, expand=expand)
1533 return
1534 # The full set of dimensions in the query is the combination of those
1535 # needed for the DatasetType and those explicitly requested, if any.
1536 requestedDimensionNames = set(datasetType.dimensions.names)
1537 if dimensions is not None:
1538 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1539 # Construct the summary structure needed to construct a QueryBuilder.
1540 summary = QuerySummary(
1541 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1542 dataId=standardizedDataId,
1543 expression=where,
1544 )
1545 builder = self.makeQueryBuilder(summary)
1546 # Add the dataset subquery to the query, telling the QueryBuilder to
1547 # include the rank of the selected collection in the results only if we
1548 # need to deduplicate. Note that if any of the collections are
1549 # actually wildcard expressions, and we've asked for deduplication,
1550 # this will raise TypeError for us.
1551 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
1552 return
1553 query = builder.finish()
1554 predicate = query.predicate()
1555 if not deduplicate:
1556 # No need to de-duplicate across collections.
1557 for row in self._db.query(query.sql):
1558 if predicate(row):
1559 dataId = query.extractDataId(row, graph=datasetType.dimensions)
1560 if expand:
1561 dataId = self.expandDataId(dataId, records=standardizedDataId.records)
1562 yield query.extractDatasetRef(row, datasetType, dataId)[0]
1563 else:
1564 # For each data ID, yield only the DatasetRef with the lowest
1565 # collection rank.
1566 bestRefs = {}
1567 bestRanks = {}
1568 for row in self._db.query(query.sql):
1569 if predicate(row):
1570 ref, rank = query.extractDatasetRef(row, datasetType)
1571 bestRank = bestRanks.get(ref.dataId, sys.maxsize)
1572 if rank < bestRank:
1573 bestRefs[ref.dataId] = ref
1574 bestRanks[ref.dataId] = rank
1575 # If caller requested expanded data IDs, we defer that until here
1576 # so we do as little expansion as possible.
1577 if expand:
1578 for ref in bestRefs.values():
1579 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
1580 yield ref.expanded(dataId)
1581 else:
1582 yield from bestRefs.values()
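# queryDatasets sketch (names are hypothetical; ``deduplicate=True`` requires
# an ordered, wildcard-free ``collections`` expression):
#
#     refs = registry.queryDatasets("calexp",
#                                   collections=["HSC/runs/example", "HSC/tagged"],
#                                   where="detector = 20",
#                                   deduplicate=True)
#     for ref in refs:
#         print(ref.dataId, ref.run)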
1584 dimensions: DimensionUniverse
1585 """The universe of all dimensions known to the registry
1586 (`DimensionUniverse`).
1587 """
1589 storageClasses: StorageClassFactory
1590 """All storage classes known to the registry (`StorageClassFactory`).
1591 """