Coverage for python/lsst/daf/butler/registry/_registry.py : 12%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "Registry",
26)
28from collections import defaultdict
29import contextlib
30import logging
31from typing import (
32 Any,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Type,
41 TYPE_CHECKING,
42 Union,
43)
45import sqlalchemy
47from ..core import (
48 Config,
49 DataCoordinate,
50 DataCoordinateIterable,
51 DataId,
52 DatasetAssociation,
53 DatasetRef,
54 DatasetType,
55 ddl,
56 Dimension,
57 DimensionElement,
58 DimensionGraph,
59 DimensionRecord,
60 DimensionUniverse,
61 NamedKeyMapping,
62 NameLookupMapping,
63 StorageClassFactory,
64 Timespan,
65)
66from . import queries
67from ..core.utils import doImport, iterable, transactional
68from ._config import RegistryConfig
69from ._collectionType import CollectionType
70from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
71from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
72from .interfaces import ChainedCollectionRecord, RunRecord
73from .versions import ButlerVersionsManager, DigestMismatchError
75if TYPE_CHECKING:
76 from ..butlerConfig import ButlerConfig
77 from .interfaces import (
78 ButlerAttributeManager,
79 CollectionManager,
80 Database,
81 OpaqueTableStorageManager,
82 DimensionRecordStorageManager,
83 DatasetRecordStorageManager,
84 DatastoreRegistryBridgeManager,
85 )
88_LOG = logging.getLogger(__name__)
91class Registry:
92 """Registry interface.
94 Parameters
95 ----------
96 database : `Database`
97 Database instance to store Registry.
98 universe : `DimensionUniverse`
99 Full set of dimensions for Registry.
100 attributes : `type`
101 Manager class implementing `ButlerAttributeManager`.
102 opaque : `type`
103 Manager class implementing `OpaqueTableStorageManager`.
104 dimensions : `type`
105 Manager class implementing `DimensionRecordStorageManager`.
106 collections : `type`
107 Manager class implementing `CollectionManager`.
108 datasets : `type`
109 Manager class implementing `DatasetRecordStorageManager`.
110 datastoreBridges : `type`
111 Manager class implementing `DatastoreRegistryBridgeManager`.
112 writeable : `bool`, optional
113 If `True`, the Registry will support write operations.
114 create : `bool`, optional
115 If `True`, the database schema will be initialized; it must be empty
116 before the Registry is instantiated.
117 """
119 defaultConfigFile: Optional[str] = None
120 """Path to configuration defaults. Accessed within the ``configs`` resource
121 or relative to a search path. Can be `None` if no defaults are specified.
122 """
124 @classmethod
125 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
126 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
127 """Create `Registry` subclass instance from `config`.
129 Uses ``registry.cls`` from `config` to determine which subclass to
130 instantiate.
132 Parameters
133 ----------
134 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
135 Registry configuration.
136 create : `bool`, optional
137 If `True`, assume an empty Registry and create a new one.
138 butlerRoot : `str`, optional
139 Path to the repository root this `Registry` will manage.
140 writeable : `bool`, optional
141 If `True` (default) create a read-write connection to the database.
143 Returns
144 -------
145 registry : `Registry` (subclass)
146 A new `Registry` subclass instance.
147 """
148 if not isinstance(config, RegistryConfig):
149 if isinstance(config, str) or isinstance(config, Config):
150 config = RegistryConfig(config)
151 else:
152 raise ValueError("Incompatible Registry configuration: {}".format(config))
153 config.replaceRoot(butlerRoot)
154 DatabaseClass = config.getDatabaseClass()
155 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
156 namespace=config.get("namespace"), writeable=writeable)
157 universe = DimensionUniverse(config)
158 attributes = doImport(config["managers", "attributes"])
159 opaque = doImport(config["managers", "opaque"])
160 dimensions = doImport(config["managers", "dimensions"])
161 collections = doImport(config["managers", "collections"])
162 datasets = doImport(config["managers", "datasets"])
163 datastoreBridges = doImport(config["managers", "datastores"])
165 return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
166 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
167 writeable=writeable, create=create)
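    # Illustrative usage sketch (editor's addition, not part of the covered
    # source). The configuration path and repository root below are
    # hypothetical; any RegistryConfig-compatible configuration would do.
    #
    #     from lsst.daf.butler.registry import Registry
    #     registry = Registry.fromConfig("registry.yaml", create=True,
    #                                    butlerRoot="/tmp/repo",
    #                                    writeable=True)
    #     print(registry.isWriteable())   # True for a read-write connection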
169 def __init__(self, database: Database, universe: DimensionUniverse, *,
170 attributes: Type[ButlerAttributeManager],
171 opaque: Type[OpaqueTableStorageManager],
172 dimensions: Type[DimensionRecordStorageManager],
173 collections: Type[CollectionManager],
174 datasets: Type[DatasetRecordStorageManager],
175 datastoreBridges: Type[DatastoreRegistryBridgeManager],
176 writeable: bool = True,
177 create: bool = False):
178 self._db = database
179 self.storageClasses = StorageClassFactory()
180 with self._db.declareStaticTables(create=create) as context:
181 self._attributes = attributes.initialize(self._db, context)
182 self._dimensions = dimensions.initialize(self._db, context, universe=universe)
183 self._collections = collections.initialize(self._db, context)
184 self._datasets = datasets.initialize(self._db, context,
185 collections=self._collections,
186 universe=self.dimensions)
187 self._opaque = opaque.initialize(self._db, context)
188 self._datastoreBridges = datastoreBridges.initialize(self._db, context,
189 opaque=self._opaque,
190 datasets=datasets,
191 universe=self.dimensions)
192 versions = ButlerVersionsManager(
193 self._attributes,
194 dict(
195 attributes=self._attributes,
196 opaque=self._opaque,
197 dimensions=self._dimensions,
198 collections=self._collections,
199 datasets=self._datasets,
200 datastores=self._datastoreBridges,
201 )
202 )
203 # store managers and their versions in attributes table
204 context.addInitializer(lambda db: versions.storeManagersConfig())
205 context.addInitializer(lambda db: versions.storeManagersVersions())
207 if not create:
208 # verify that configured versions are compatible with schema
209 versions.checkManagersConfig()
210 versions.checkManagersVersions(writeable)
211 try:
212 versions.checkManagersDigests()
213 except DigestMismatchError as exc:
214 # A digest mismatch is potentially a serious error, but during
215 # development it can be benign; treat it as a warning for
216 # now.
217 _LOG.warning(f"Registry schema digest mismatch: {exc}")
219 self._collections.refresh()
220 self._datasets.refresh(universe=self._dimensions.universe)
222 def __str__(self) -> str:
223 return str(self._db)
225 def __repr__(self) -> str:
226 return f"Registry({self._db!r}, {self.dimensions!r})"
228 def isWriteable(self) -> bool:
229 """Return `True` if this registry allows write operations, and `False`
230 otherwise.
231 """
232 return self._db.isWriteable()
234 @property
235 def dimensions(self) -> DimensionUniverse:
236 """All dimensions recognized by this `Registry` (`DimensionUniverse`).
237 """
238 return self._dimensions.universe
240 @contextlib.contextmanager
241 def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
242 """Return a context manager that represents a transaction.
243 """
244 try:
245 with self._db.transaction(savepoint=savepoint):
246 yield
247 except BaseException:
248 # TODO: this clears the caches sometimes when we wouldn't actually
249 # need to. Can we avoid that?
250 self._dimensions.clearCaches()
251 raise
253 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
254 """Add an opaque (to the `Registry`) table for use by a `Datastore` or
255 other data repository client.
257 Opaque table records can be added via `insertOpaqueData`, retrieved via
258 `fetchOpaqueData`, and removed via `deleteOpaqueData`.
260 Parameters
261 ----------
262 tableName : `str`
263 Logical name of the opaque table. This may differ from the
264 actual name used in the database by a prefix and/or suffix.
265 spec : `ddl.TableSpec`
266 Specification for the table to be added.
267 """
268 self._opaque.register(tableName, spec)
270 @transactional
271 def insertOpaqueData(self, tableName: str, *data: dict) -> None:
272 """Insert records into an opaque table.
274 Parameters
275 ----------
276 tableName : `str`
277 Logical name of the opaque table. Must match the name used in a
278 previous call to `registerOpaqueTable`.
279 data
280 Each additional positional argument is a dictionary that represents
281 a single row to be added.
282 """
283 self._opaque[tableName].insert(*data)
285 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
286 """Retrieve records from an opaque table.
288 Parameters
289 ----------
290 tableName : `str`
291 Logical name of the opaque table. Must match the name used in a
292 previous call to `registerOpaqueTable`.
293 where
294 Additional keyword arguments are interpreted as equality
295 constraints that restrict the returned rows (combined with AND);
296 keyword arguments are column names and values are the values they
297 must have.
299 Yields
300 ------
301 row : `dict`
302 A dictionary representing a single result row.
303 """
304 yield from self._opaque[tableName].fetch(**where)
306 @transactional
307 def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
308 """Remove records from an opaque table.
310 Parameters
311 ----------
312 tableName : `str`
313 Logical name of the opaque table. Must match the name used in a
314 previous call to `registerOpaqueTable`.
315 where
316 Additional keyword arguments are interpreted as equality
317 constraints that restrict the deleted rows (combined with AND);
318 keyword arguments are column names and values are the values they
319 must have.
320 """
321 self._opaque[tableName].delete(**where)
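    # Usage sketch for the opaque-table methods above (editor's addition).
    # It assumes a table named "datastore_records" was registered earlier via
    # registerOpaqueTable with a matching ddl.TableSpec; the table and column
    # names are hypothetical.
    #
    #     registry.insertOpaqueData("datastore_records",
    #                               {"dataset_id": 42, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records",
    #                                          dataset_id=42))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=42)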
323 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
324 """Add a new collection if one with the given name does not exist.
326 Parameters
327 ----------
328 name : `str`
329 The name of the collection to create.
330 type : `CollectionType`
331 Enum value indicating the type of collection to create.
333 Notes
334 -----
335 This method cannot be called within transactions, as it needs to be
336 able to perform its own transaction to be concurrent.
337 """
338 self._collections.register(name, type)
340 def getCollectionType(self, name: str) -> CollectionType:
341 """Return an enumeration value indicating the type of the given
342 collection.
344 Parameters
345 ----------
346 name : `str`
347 The name of the collection.
349 Returns
350 -------
351 type : `CollectionType`
352 Enum value indicating the type of this collection.
354 Raises
355 ------
356 MissingCollectionError
357 Raised if no collection with the given name exists.
358 """
359 return self._collections.find(name).type
361 def registerRun(self, name: str) -> None:
362 """Add a new run if one with the given name does not exist.
364 Parameters
365 ----------
366 name : `str`
367 The name of the run to create.
369 Notes
370 -----
371 This method cannot be called within transactions, as it needs to be
372 able to perform its own transaction to be concurrent.
373 """
374 self._collections.register(name, CollectionType.RUN)
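    # Usage sketch for collection and run registration (editor's addition);
    # the collection names are hypothetical.
    #
    #     registry.registerCollection("u/someone/tagged",
    #                                 CollectionType.TAGGED)
    #     registry.registerRun("u/someone/run")
    #     registry.getCollectionType("u/someone/run")   # CollectionType.RUN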
376 @transactional
377 def removeCollection(self, name: str) -> None:
378 """Completely remove the given collection.
380 Parameters
381 ----------
382 name : `str`
383 The name of the collection to remove.
385 Raises
386 ------
387 MissingCollectionError
388 Raised if no collection with the given name exists.
390 Notes
391 -----
392 If this is a `~CollectionType.RUN` collection, all datasets and quanta
393 in it are also fully removed. This requires that those datasets be
394 removed (or at least trashed) from any datastores that hold them first.
396 A collection may not be deleted as long as it is referenced by a
397 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
398 be deleted or redefined first.
399 """
400 self._collections.remove(name)
402 def getCollectionChain(self, parent: str) -> CollectionSearch:
403 """Return the child collections in a `~CollectionType.CHAINED`
404 collection.
406 Parameters
407 ----------
408 parent : `str`
409 Name of the chained collection. Must have already been added via
410 a call to `Registry.registerCollection`.
412 Returns
413 -------
414 children : `CollectionSearch`
415 An object that defines the search path of the collection.
416 See :ref:`daf_butler_collection_expressions` for more information.
418 Raises
419 ------
420 MissingCollectionError
421 Raised if ``parent`` does not exist in the `Registry`.
422 TypeError
423 Raised if ``parent`` does not correspond to a
424 `~CollectionType.CHAINED` collection.
425 """
426 record = self._collections.find(parent)
427 if record.type is not CollectionType.CHAINED:
428 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
429 assert isinstance(record, ChainedCollectionRecord)
430 return record.children
432 @transactional
433 def setCollectionChain(self, parent: str, children: Any) -> None:
434 """Define or redefine a `~CollectionType.CHAINED` collection.
436 Parameters
437 ----------
438 parent : `str`
439 Name of the chained collection. Must have already been added via
440 a call to `Registry.registerCollection`.
441 children : `Any`
442 An expression defining an ordered search of child collections,
443 generally an iterable of `str`. Restrictions on the dataset types
444 to be searched can also be included, by passing a mapping or an
445 iterable containing tuples; see
446 :ref:`daf_butler_collection_expressions` for more information.
448 Raises
449 ------
450 MissingCollectionError
451 Raised when any of the given collections do not exist in the
452 `Registry`.
453 TypeError
454 Raised if ``parent`` does not correspond to a
455 `~CollectionType.CHAINED` collection.
456 ValueError
457 Raised if the given collections contain a cycle.
458 """
459 record = self._collections.find(parent)
460 if record.type is not CollectionType.CHAINED:
461 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
462 assert isinstance(record, ChainedCollectionRecord)
463 children = CollectionSearch.fromExpression(children)
464 if children != record.children:
465 record.update(self._collections, children)
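    # Usage sketch for CHAINED collections (editor's addition); the names are
    # hypothetical and the child collections are assumed to exist already.
    #
    #     registry.registerCollection("defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("defaults",
    #                                 ["u/someone/run", "calib"])
    #     registry.getCollectionChain("defaults")   # ordered search path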
467 def registerDatasetType(self, datasetType: DatasetType) -> bool:
468 """
469 Add a new `DatasetType` to the Registry.
471 It is not an error to register the same `DatasetType` twice.
473 Parameters
474 ----------
475 datasetType : `DatasetType`
476 The `DatasetType` to be added.
478 Returns
479 -------
480 inserted : `bool`
481 `True` if ``datasetType`` was inserted, `False` if an identical
482 existing `DatasetType` was found. Note that in either case the
483 DatasetType is guaranteed to be defined in the Registry
484 consistently with the given definition.
486 Raises
487 ------
488 ValueError
489 Raised if the dimensions or storage class are invalid.
490 ConflictingDefinitionError
491 Raised if this DatasetType is already registered with a different
492 definition.
494 Notes
495 -----
496 This method cannot be called within transactions, as it needs to be
497 able to perform its own transaction to be concurrent.
498 """
499 _, inserted = self._datasets.register(datasetType)
500 return inserted
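    # Usage sketch (editor's addition). The dataset type name, dimensions,
    # and storage class are hypothetical, and the positional constructor
    # arguments (name, dimensions, storageClass) are an assumption about
    # `DatasetType`.
    #
    #     dims = registry.dimensions.extract(["instrument", "detector"])
    #     flatType = DatasetType("my_flat", dims, "ExposureF")
    #     registry.registerDatasetType(flatType)   # True on first insert
    #     registry.registerDatasetType(flatType)   # False, already present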
502 def removeDatasetType(self, name: str) -> None:
503 """Remove the named `DatasetType` from the registry.
505 .. warning::
507 Registry caches the dataset type definitions. This means that
508 deleting the dataset type definition may result in unexpected
509 behavior from other active butler processes that have
510 not seen the deletion.
512 Parameters
513 ----------
514 name : `str`
515 Name of the type to be removed.
517 Raises
518 ------
519 lsst.daf.butler.registry.OrphanedRecordError
520 Raised if an attempt is made to remove the dataset type definition
521 when there are already datasets associated with it.
523 Notes
524 -----
525 If the dataset type is not registered the method will return without
526 action.
527 """
528 self._datasets.remove(name, universe=self._dimensions.universe)
530 def getDatasetType(self, name: str) -> DatasetType:
531 """Get the `DatasetType`.
533 Parameters
534 ----------
535 name : `str`
536 Name of the type.
538 Returns
539 -------
540 type : `DatasetType`
541 The `DatasetType` associated with the given name.
543 Raises
544 ------
545 KeyError
546 Raised if the requested DatasetType could not be found in the registry.
547 """
548 return self._datasets[name].datasetType
550 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
551 collections: Any, timespan: Optional[Timespan] = None,
552 **kwargs: Any) -> Optional[DatasetRef]:
553 """Find a dataset given its `DatasetType` and data ID.
555 This can be used to obtain a `DatasetRef` that permits the dataset to
556 be read from a `Datastore`. If the dataset is a component and can not
557 be found using the provided dataset type, a dataset ref for the parent
558 will be returned instead but with the correct dataset type.
560 Parameters
561 ----------
562 datasetType : `DatasetType` or `str`
563 A `DatasetType` or the name of one.
564 dataId : `dict` or `DataCoordinate`, optional
565 A `dict`-like object containing the `Dimension` links that identify
566 the dataset within a collection.
567 collections
568 An expression that fully or partially identifies the collections
569 to search for the dataset, such as a `str`, `DatasetType`, or
570 iterable thereof. See :ref:`daf_butler_collection_expressions`
571 for more information.
572 timespan : `Timespan`, optional
573 A timespan that the validity range of the dataset must overlap.
574 If not provided, any `~CollectionType.CALIBRATION` collections
575 matched by the ``collections`` argument will not be searched.
576 **kwargs
577 Additional keyword arguments passed to
578 `DataCoordinate.standardize` to convert ``dataId`` to a true
579 `DataCoordinate` or augment an existing one.
581 Returns
582 -------
583 ref : `DatasetRef` or `None`
584 A reference to the dataset, or `None` if no matching Dataset
585 was found.
587 Raises
588 ------
589 LookupError
590 Raised if one or more data ID keys are missing.
591 KeyError
592 Raised if the dataset type does not exist.
593 MissingCollectionError
594 Raised if any of ``collections`` does not exist in the registry.
596 Notes
597 -----
598 This method simply returns `None` and does not raise an exception even
599 when the set of collections searched is intrinsically incompatible with
600 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
601 only `~CollectionType.CALIBRATION` collections are being searched.
602 This may make it harder to debug some lookup failures, but the behavior
603 is intentional; we consider it more important that failed searches are
604 reported consistently, regardless of the reason, and that adding
605 additional collections that do not contain a match to the search path
606 never changes the behavior.
607 """
608 if isinstance(datasetType, DatasetType):
609 storage = self._datasets[datasetType.name]
610 else:
611 storage = self._datasets[datasetType]
612 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
613 universe=self.dimensions, **kwargs)
614 collections = CollectionSearch.fromExpression(collections)
615 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
616 if (collectionRecord.type is CollectionType.CALIBRATION
617 and (not storage.datasetType.isCalibration() or timespan is None)):
618 continue
619 result = storage.find(collectionRecord, dataId, timespan=timespan)
620 if result is not None:
621 return result
623 return None
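    # Usage sketch (editor's addition); the dataset type, data ID values,
    # and collection names are hypothetical.
    #
    #     ref = registry.findDataset("my_flat",
    #                                instrument="HypCam", detector=12,
    #                                collections=["u/someone/run"])
    #     if ref is not None:
    #         print(ref.id, ref.run)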
625 @transactional
626 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
627 run: str) -> List[DatasetRef]:
628 """Insert one or more datasets into the `Registry`
630 This always adds new datasets; to associate existing datasets with
631 a new collection, use ``associate``.
633 Parameters
634 ----------
635 datasetType : `DatasetType` or `str`
636 A `DatasetType` or the name of one.
637 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
638 Dimension-based identifiers for the new datasets.
639 run : `str`
640 The name of the run that produced the datasets.
642 Returns
643 -------
644 refs : `list` of `DatasetRef`
645 Resolved `DatasetRef` instances for all given data IDs (in the same
646 order).
648 Raises
649 ------
650 ConflictingDefinitionError
651 If a dataset with the same dataset type and data ID as one of those
652 given already exists in ``run``.
653 MissingCollectionError
654 Raised if ``run`` does not exist in the registry.
655 """
656 if isinstance(datasetType, DatasetType):
657 storage = self._datasets.find(datasetType.name)
658 if storage is None:
659 raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
660 else:
661 storage = self._datasets.find(datasetType)
662 if storage is None:
663 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
664 runRecord = self._collections.find(run)
665 if runRecord.type is not CollectionType.RUN:
666 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
667 assert isinstance(runRecord, RunRecord)
668 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
669 for dataId in dataIds]
670 try:
671 refs = list(storage.insert(runRecord, expandedDataIds))
672 except sqlalchemy.exc.IntegrityError as err:
673 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
674 f"one or more datasets of type {storage.datasetType} into "
675 f"collection '{run}'. "
676 f"This probably means a dataset with the same data ID "
677 f"and dataset type already exists, but it may also mean a "
678 f"dimension row is missing.") from err
679 return refs
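    # Usage sketch (editor's addition); the data ID values and run name are
    # hypothetical, and the dataset type must already be registered.
    #
    #     refs = registry.insertDatasets(
    #         "my_flat",
    #         dataIds=[{"instrument": "HypCam", "detector": 12}],
    #         run="u/someone/run",
    #     )
    #     print(refs[0].id)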
681 def getDataset(self, id: int) -> Optional[DatasetRef]:
682 """Retrieve a Dataset entry.
684 Parameters
685 ----------
686 id : `int`
687 The unique identifier for the dataset.
689 Returns
690 -------
691 ref : `DatasetRef` or `None`
692 A ref to the Dataset, or `None` if no matching Dataset
693 was found.
694 """
695 ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
696 if ref is None:
697 return None
698 return ref
700 @transactional
701 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
702 """Remove datasets from the Registry.
704 The datasets will be removed unconditionally from all collections, and
705 any `Quantum` that consumed this dataset will instead be marked as
706 having a NULL input. `Datastore` records will *not* be deleted; the
707 caller is responsible for ensuring that the dataset has already been
708 removed from all Datastores.
710 Parameters
711 ----------
712 refs : `Iterable` of `DatasetRef`
713 References to the datasets to be removed. Must include a valid
714 ``id`` attribute, and should be considered invalidated upon return.
716 Raises
717 ------
718 AmbiguousDatasetError
719 Raised if any ``ref.id`` is `None`.
720 OrphanedRecordError
721 Raised if any dataset is still present in any `Datastore`.
722 """
723 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
724 storage = self._datasets.find(datasetType.name)
725 assert storage is not None
726 try:
727 storage.delete(refsForType)
728 except sqlalchemy.exc.IntegrityError as err:
729 raise OrphanedRecordError("One or more datasets is still "
730 "present in one or more Datastores.") from err
732 @transactional
733 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
734 """Add existing datasets to a `~CollectionType.TAGGED` collection.
736 If a `DatasetRef` with the exact same integer ID is already in a
737 collection, nothing is changed. If a `DatasetRef` with the same
738 `DatasetType` and data ID but with different integer ID
739 exists in the collection, `ConflictingDefinitionError` is raised.
741 Parameters
742 ----------
743 collection : `str`
744 Indicates the collection the datasets should be associated with.
745 refs : `Iterable` [ `DatasetRef` ]
746 An iterable of resolved `DatasetRef` instances that already exist
747 in this `Registry`.
749 Raises
750 ------
751 ConflictingDefinitionError
752 If a Dataset with the given `DatasetRef` already exists in the
753 given collection.
754 AmbiguousDatasetError
755 Raised if ``any(ref.id is None for ref in refs)``.
756 MissingCollectionError
757 Raised if ``collection`` does not exist in the registry.
758 TypeError
759 Raised if adding new datasets to the given ``collection`` is not
760 allowed.
761 """
762 collectionRecord = self._collections.find(collection)
763 if collectionRecord.type is not CollectionType.TAGGED:
764 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
765 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
766 storage = self._datasets.find(datasetType.name)
767 assert storage is not None
768 try:
769 storage.associate(collectionRecord, refsForType)
770 except sqlalchemy.exc.IntegrityError as err:
771 raise ConflictingDefinitionError(
772 f"Constraint violation while associating dataset of type {datasetType.name} with "
773 f"collection {collection}. This probably means that one or more datasets with the same "
774 f"dataset type and data ID already exist in the collection, but it may also indicate "
775 f"that the datasets do not exist."
776 ) from err
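    # Usage sketch for tagging datasets (editor's addition); the TAGGED
    # collection name is hypothetical and ``refs`` must be resolved
    # DatasetRef instances, e.g. as returned by insertDatasets.
    #
    #     registry.associate("u/someone/tagged", refs)
    #     registry.disassociate("u/someone/tagged", refs)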
778 @transactional
779 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
780 """Remove existing datasets from a `~CollectionType.TAGGED` collection.
782 ``collection`` and ``ref`` combinations that are not currently
783 associated are silently ignored.
785 Parameters
786 ----------
787 collection : `str`
788 The collection the datasets should no longer be associated with.
789 refs : `Iterable` [ `DatasetRef` ]
790 An iterable of resolved `DatasetRef` instances that already exist
791 in this `Registry`.
793 Raises
794 ------
795 AmbiguousDatasetError
796 Raised if any of the given dataset references is unresolved.
797 MissingCollectionError
798 Raised if ``collection`` does not exist in the registry.
799 TypeError
800 Raised if removing datasets from the given ``collection`` is not
801 allowed.
802 """
803 collectionRecord = self._collections.find(collection)
804 if collectionRecord.type is not CollectionType.TAGGED:
805 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
806 "expected TAGGED.")
807 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
808 storage = self._datasets.find(datasetType.name)
809 assert storage is not None
810 storage.disassociate(collectionRecord, refsForType)
812 @transactional
813 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
814 """Associate one or more datasets with a calibration collection and a
815 validity range within it.
817 Parameters
818 ----------
819 collection : `str`
820 The name of an already-registered `~CollectionType.CALIBRATION`
821 collection.
822 refs : `Iterable` [ `DatasetRef` ]
823 Datasets to be associated.
824 timespan : `Timespan`
825 The validity range for these datasets within the collection.
827 Raises
828 ------
829 AmbiguousDatasetError
830 Raised if any of the given `DatasetRef` instances is unresolved.
831 ConflictingDefinitionError
832 Raised if the collection already contains a different dataset with
833 the same `DatasetType` and data ID and an overlapping validity
834 range.
835 TypeError
836 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
837 collection or if one or more datasets are of a dataset type for
838 which `DatasetType.isCalibration` returns `False`.
839 """
840 collectionRecord = self._collections.find(collection)
841 for datasetType, refsForType in DatasetRef.groupByType(refs).items():
842 storage = self._datasets[datasetType.name]
843 storage.certify(collectionRecord, refsForType, timespan)
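    # Usage sketch for calibration collections (editor's addition); the
    # collection name is hypothetical, and expressing an unbounded validity
    # range as Timespan(begin=None, end=None) is an assumption.
    #
    #     registry.registerCollection("calib", CollectionType.CALIBRATION)
    #     registry.certify("calib", refs, Timespan(begin=None, end=None))
    #     registry.decertify("calib", "my_flat",
    #                        Timespan(begin=None, end=None))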
845 @transactional
846 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
847 dataIds: Optional[Iterable[DataId]] = None) -> None:
848 """Remove or adjust datasets to clear a validity range within a
849 calibration collection.
851 Parameters
852 ----------
853 collection : `str`
854 The name of an already-registered `~CollectionType.CALIBRATION`
855 collection.
856 datasetType : `str` or `DatasetType`
857 Name or `DatasetType` instance for the datasets to be decertified.
858 timespan : `Timespan`, optional
859 The validity range to remove datasets from within the collection.
860 Datasets that overlap this range but are not contained by it will
861 have their validity ranges adjusted to not overlap it, which may
862 split a single dataset validity range into two.
863 dataIds : `Iterable` [ `DataId` ], optional
864 Data IDs that should be decertified within the given validity range.
865 If `None`, all data IDs for the given ``datasetType`` will be
866 decertified.
868 Raises
869 ------
870 TypeError
871 Raised if ``collection`` is not a `~CollectionType.CALIBRATION`
872 collection or if ``datasetType.isCalibration() is False``.
873 """
874 collectionRecord = self._collections.find(collection)
875 if isinstance(datasetType, str):
876 storage = self._datasets[datasetType]
877 else:
878 storage = self._datasets[datasetType.name]
879 standardizedDataIds = None
880 if dataIds is not None:
881 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
882 for d in dataIds]
883 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)
885 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
886 """Return an object that allows a new `Datastore` instance to
887 communicate with this `Registry`.
889 Returns
890 -------
891 manager : `DatastoreRegistryBridgeManager`
892 Object that mediates communication between this `Registry` and its
893 associated datastores.
894 """
895 return self._datastoreBridges
897 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
898 """Retrieve datastore locations for a given dataset.
900 Parameters
901 ----------
902 ref : `DatasetRef`
903 A reference to the dataset for which to retrieve storage
904 information.
906 Returns
907 -------
908 datastores : `Iterable` [ `str` ]
909 All the matching datastores holding this dataset.
911 Raises
912 ------
913 AmbiguousDatasetError
914 Raised if ``ref.id`` is `None`.
915 """
916 return self._datastoreBridges.findDatastores(ref)
918 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
919 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
920 **kwargs: Any) -> DataCoordinate:
921 """Expand a dimension-based data ID to include additional information.
923 Parameters
924 ----------
925 dataId : `DataCoordinate` or `dict`, optional
926 Data ID to be expanded; augmented and overridden by ``kwargs``.
927 graph : `DimensionGraph`, optional
928 Set of dimensions for the expanded ID. If `None`, the dimensions
929 will be inferred from the keys of ``dataId`` and ``kwargs``.
930 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``
931 are silently ignored, providing a way to extract and expand a
932 subset of a data ID.
933 records : `Mapping` [`str`, `DimensionRecord`], optional
934 Dimension record data to use before querying the database for that
935 data, keyed by element name.
936 **kwargs
937 Additional keywords are treated like additional key-value pairs for
938 ``dataId``, extending and overriding it.
940 Returns
941 -------
942 expanded : `DataCoordinate`
943 A data ID that includes full metadata for all of the dimensions it
944 identifies, i.e. it guarantees that ``expanded.hasRecords()`` and
945 ``expanded.hasFull()`` both return `True`.
946 """
947 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
948 if standardized.hasRecords():
949 return standardized
950 if records is None:
951 records = {}
952 elif isinstance(records, NamedKeyMapping):
953 records = records.byName()
954 else:
955 records = dict(records)
956 if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
957 records.update(dataId.records.byName())
958 keys = standardized.byName()
959 for element in standardized.graph.primaryKeyTraversalOrder:
960 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL
961 if record is ...:
962 if isinstance(element, Dimension) and keys.get(element.name) is None:
963 if element in standardized.graph.required:
964 raise LookupError(
965 f"No value or null value for required dimension {element.name}."
966 )
967 keys[element.name] = None
968 record = None
969 else:
970 storage = self._dimensions[element]
971 dataIdSet = DataCoordinateIterable.fromScalar(
972 DataCoordinate.standardize(keys, graph=element.graph)
973 )
974 fetched = tuple(storage.fetch(dataIdSet))
975 try:
976 (record,) = fetched
977 except ValueError:
978 record = None
979 records[element.name] = record
980 if record is not None:
981 for d in element.implied:
982 value = getattr(record, d.name)
983 if keys.setdefault(d.name, value) != value:
984 raise InconsistentDataIdError(
985 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
986 f"but {element.name} implies {d.name}={value!r}."
987 )
988 else:
989 if element in standardized.graph.required:
990 raise LookupError(
991 f"Could not fetch record for required dimension {element.name} via keys {keys}."
992 )
993 if element.alwaysJoin:
994 raise InconsistentDataIdError(
995 f"Could not fetch record for element {element.name} via keys {keys}, ",
996 "but it is marked alwaysJoin=True; this means one or more dimensions are not "
997 "related."
998 )
999 for d in element.implied:
1000 keys.setdefault(d.name, None)
1001 records.setdefault(d.name, None)
1002 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
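    # Usage sketch (editor's addition); the dimension names and values are
    # hypothetical and depend on the configured dimension universe.
    #
    #     expanded = registry.expandDataId(instrument="HypCam", detector=12)
    #     expanded.hasFull()      # True
    #     expanded.hasRecords()   # True
    #     expanded.records.byName()["detector"]   # matching DimensionRecord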
1004 def insertDimensionData(self, element: Union[DimensionElement, str],
1005 *data: Union[Mapping[str, Any], DimensionRecord],
1006 conform: bool = True) -> None:
1007 """Insert one or more dimension records into the database.
1009 Parameters
1010 ----------
1011 element : `DimensionElement` or `str`
1012 The `DimensionElement` or name thereof that identifies the table
1013 records will be inserted into.
1014 data : `dict` or `DimensionRecord` (variadic)
1015 One or more records to insert.
1016 conform : `bool`, optional
1017 If `False` (`True` is default), perform no checking or conversions,
1018 and assume that ``element`` is a `DimensionElement` instance and
1019 ``data`` is one or more `DimensionRecord` instances of the
1020 appropriate subclass.
1021 """
1022 if conform:
1023 if isinstance(element, str):
1024 element = self.dimensions[element]
1025 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1026 for row in data]
1027 else:
1028 # Ignore typing since caller said to trust them with conform=False.
1029 records = data # type: ignore
1030 storage = self._dimensions[element] # type: ignore
1031 storage.insert(*records)
1033 def syncDimensionData(self, element: Union[DimensionElement, str],
1034 row: Union[Mapping[str, Any], DimensionRecord],
1035 conform: bool = True) -> bool:
1036 """Synchronize the given dimension record with the database, inserting
1037 if it does not already exist and comparing values if it does.
1039 Parameters
1040 ----------
1041 element : `DimensionElement` or `str`
1042 The `DimensionElement` or name thereof that identifies the table
1043 records will be inserted into.
1044 row : `dict` or `DimensionRecord`
1045 The record to insert.
1046 conform : `bool`, optional
1047 If `False` (`True` is default), perform no checking or conversions,
1048 and assume that ``element`` is a `DimensionElement` instance and
1049 ``row`` is a `DimensionRecord` instance of the
1050 appropriate subclass.
1052 Returns
1053 -------
1054 inserted : `bool`
1055 `True` if a new row was inserted, `False` otherwise.
1057 Raises
1058 ------
1059 ConflictingDefinitionError
1060 Raised if the record exists in the database (according to primary
1061 key lookup) but is inconsistent with the given one.
1062 """
1063 if conform:
1064 if isinstance(element, str):
1065 element = self.dimensions[element]
1066 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
1067 else:
1068 # Ignore typing since caller said to trust them with conform=False.
1069 record = row # type: ignore
1070 storage = self._dimensions[element] # type: ignore
1071 return storage.sync(record)
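    # Usage sketch (editor's addition). Record field names depend on the
    # configured dimension universe; the ones shown for "instrument" are
    # purely illustrative.
    #
    #     registry.insertDimensionData(
    #         "instrument", {"name": "HypCam", "detector_max": 100})
    #     registry.syncDimensionData(
    #         "instrument", {"name": "HypCam", "detector_max": 100})  # False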
1073 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
1074 ) -> Iterator[DatasetType]:
1075 """Iterate over the dataset types whose names match an expression.
1077 Parameters
1078 ----------
1079 expression : `Any`, optional
1080 An expression that fully or partially identifies the dataset types
1081 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1082 `...` can be used to return all dataset types, and is the default.
1083 See :ref:`daf_butler_dataset_type_expressions` for more
1084 information.
1085 components : `bool`, optional
1086 If `True`, apply all expression patterns to component dataset type
1087 names as well. If `False`, never apply patterns to components.
1088 If `None` (default), apply patterns to components only if their
1089 parent datasets were not matched by the expression.
1090 Fully-specified component datasets (`str` or `DatasetType`
1091 instances) are always included.
1093 Yields
1094 ------
1095 datasetType : `DatasetType`
1096 A `DatasetType` instance whose name matches ``expression``.
1097 """
1098 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
1099 if wildcard is Ellipsis:
1100 for datasetType in self._datasets:
1101 # The dataset type can no longer be a component
1102 yield datasetType
1103 if components and datasetType.isComposite():
1104 # Automatically create the component dataset types
1105 for component in datasetType.makeAllComponentDatasetTypes():
1106 yield component
1107 return
1108 done: Set[str] = set()
1109 for name in wildcard.strings:
1110 storage = self._datasets.find(name)
1111 if storage is not None:
1112 done.add(storage.datasetType.name)
1113 yield storage.datasetType
1114 if wildcard.patterns:
1115 # If components (the argument) is None, we'll save component
1116 # datasets that we might want to match, but only if their parents
1117 # didn't get included.
1118 componentsForLater = []
1119 for registeredDatasetType in self._datasets:
1120 # Components are not stored in registry so expand them here
1121 allDatasetTypes = [registeredDatasetType] \
1122 + registeredDatasetType.makeAllComponentDatasetTypes()
1123 for datasetType in allDatasetTypes:
1124 if datasetType.name in done:
1125 continue
1126 parentName, componentName = datasetType.nameAndComponent()
1127 if componentName is not None and not components:
1128 if components is None and parentName not in done:
1129 componentsForLater.append(datasetType)
1130 continue
1131 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1132 done.add(datasetType.name)
1133 yield datasetType
1134 # Go back and try to match saved components.
1135 for datasetType in componentsForLater:
1136 parentName, _ = datasetType.nameAndComponent()
1137 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
1138 yield datasetType
1140 def queryCollections(self, expression: Any = ...,
1141 datasetType: Optional[DatasetType] = None,
1142 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1143 flattenChains: bool = False,
1144 includeChains: Optional[bool] = None) -> Iterator[str]:
1145 """Iterate over the collections whose names match an expression.
1147 Parameters
1148 ----------
1149 expression : `Any`, optional
1150 An expression that fully or partially identifies the collections
1151 to return, such as a `str`, `re.Pattern`, or iterable thereof.
1152 `...` can be used to return all collections, and is the default.
1153 See :ref:`daf_butler_collection_expressions` for more
1154 information.
1155 datasetType : `DatasetType`, optional
1156 If provided, only yield collections that should be searched for
1157 this dataset type according to ``expression``. If this is
1158 not provided, any dataset type restrictions in ``expression`` are
1159 ignored.
1160 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1161 If provided, only yield collections of these types.
1162 flattenChains : `bool`, optional
1163 If `True` (`False` is default), recursively yield the child
1164 collections of matching `~CollectionType.CHAINED` collections.
1165 includeChains : `bool`, optional
1166 If `True`, yield records for matching `~CollectionType.CHAINED`
1167 collections. Default is the opposite of ``flattenChains``: include
1168 either CHAINED collections or their children, but not both.
1170 Yields
1171 ------
1172 collection : `str`
1173 The name of a collection that matches ``expression``.
1174 """
1175 query = CollectionQuery.fromExpression(expression)
1176 for record in query.iter(self._collections, datasetType=datasetType,
1177 collectionTypes=frozenset(collectionTypes),
1178 flattenChains=flattenChains, includeChains=includeChains):
1179 yield record.name
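    # Usage sketch for the two query methods above (editor's addition);
    # the pattern and the collection-type restriction are hypothetical.
    #
    #     import re
    #     list(registry.queryDatasetTypes(re.compile("my_.*")))
    #     list(registry.queryCollections(...,   # `...` matches everything
    #                                    collectionTypes={CollectionType.RUN}))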
1181 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
1182 """Return a `QueryBuilder` instance capable of constructing and
1183 managing more complex queries than those obtainable via `Registry`
1184 interfaces.
1186 This is an advanced interface; downstream code should prefer
1187 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
1188 are sufficient.
1190 Parameters
1191 ----------
1192 summary : `queries.QuerySummary`
1193 Object describing and categorizing the full set of dimensions that
1194 will be included in the query.
1196 Returns
1197 -------
1198 builder : `queries.QueryBuilder`
1199 Object that can be used to construct and perform advanced queries.
1200 """
1201 return queries.QueryBuilder(
1202 summary,
1203 queries.RegistryManagers(
1204 collections=self._collections,
1205 dimensions=self._dimensions,
1206 datasets=self._datasets
1207 )
1208 )
1210 def queryDatasets(self, datasetType: Any, *,
1211 collections: Any,
1212 dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
1213 dataId: Optional[DataId] = None,
1214 where: Optional[str] = None,
1215 findFirst: bool = False,
1216 components: Optional[bool] = None,
1217 **kwargs: Any) -> queries.DatasetQueryResults:
1218 """Query for and iterate over dataset references matching user-provided
1219 criteria.
1221 Parameters
1222 ----------
1223 datasetType
1224 An expression that fully or partially identifies the dataset types
1225 to be queried. Allowed types include `DatasetType`, `str`,
1226 `re.Pattern`, and iterables thereof. The special value `...` can
1227 be used to query all dataset types. See
1228 :ref:`daf_butler_dataset_type_expressions` for more information.
1229 collections
1230 An expression that fully or partially identifies the collections
1231 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1232 thereof. `...` can be used to search datasets from all
1233 `~CollectionType.RUN` collections (no other collections are
1234 necessary, because all datasets are in a ``RUN`` collection). See
1235 :ref:`daf_butler_collection_expressions` for more information.
1236 dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
1237 Dimensions to include in the query (in addition to those used
1238 to identify the queried dataset type(s)), either to constrain
1239 the resulting datasets to those for which a matching dimension
1240 exists, or to relate the dataset type's dimensions to dimensions
1241 referenced by the ``dataId`` or ``where`` arguments.
1242 dataId : `dict` or `DataCoordinate`, optional
1243 A data ID whose key-value pairs are used as equality constraints
1244 in the query.
1245 where : `str`, optional
1246 A string expression similar to a SQL WHERE clause. May involve
1247 any column of a dimension table or (as a shortcut for the primary
1248 key column of a dimension table) dimension name. See
1249 :ref:`daf_butler_dimension_expressions` for more information.
1250 findFirst : `bool`, optional
1251 If `True` (`False` is default), for each result data ID, only
1252 yield one `DatasetRef` of each `DatasetType`, from the first
1253 collection in which a dataset of that dataset type appears
1254 (according to the order of ``collections`` passed in). If `True`,
1255 ``collections`` must not contain regular expressions and may not
1256 be `...`.
1257 components : `bool`, optional
1258 If `True`, apply all dataset expression patterns to component
1259 dataset type names as well. If `False`, never apply patterns to
1260 components. If `None` (default), apply patterns to components only
1261 if their parent datasets were not matched by the expression.
1262 Fully-specified component datasets (`str` or `DatasetType`
1263 instances) are always included.
1264 **kwargs
1265 Additional keyword arguments are forwarded to
1266 `DataCoordinate.standardize` when processing the ``dataId``
1267 argument (and may be used to provide a constraining data ID even
1268 when the ``dataId`` argument is `None`).
1270 Returns
1271 -------
1272 refs : `queries.DatasetQueryResults`
1273 Dataset references matching the given query criteria.
1275 Raises
1276 ------
1277 TypeError
1278 Raised when the arguments are incompatible, such as when a
1279 collection wildcard is passed when ``findFirst`` is `True`.
1281 Notes
1282 -----
1283 When multiple dataset types are queried in a single call, the
1284 results of this operation are equivalent to querying for each dataset
1285 type separately in turn, and no information about the relationships
1286 between datasets of different types is included. In contexts where
1287 that kind of information is important, the recommended pattern is to
1288 use `queryDataIds` to first obtain data IDs (possibly with the
1289 desired dataset types and collections passed as constraints to the
1290 query), and then use multiple (generally much simpler) calls to
1291 `queryDatasets` with the returned data IDs passed as constraints.
1292 """
1293 # Standardize the collections expression.
1294 if findFirst:
1295 collections = CollectionSearch.fromExpression(collections)
1296 else:
1297 collections = CollectionQuery.fromExpression(collections)
1298 # Standardize and expand the data ID provided as a constraint.
1299 standardizedDataId = self.expandDataId(dataId, **kwargs)
1301 # We can only query directly if given a non-component DatasetType
1302 # instance. If we were given an expression or str or a component
1303 # DatasetType instance, we'll populate this dict, recurse, and return.
1304 # If we already have a non-component DatasetType, it will remain None
1305 # and we'll run the query directly.
1306 composition: Optional[
1307 Dict[
1308 DatasetType, # parent dataset type
1309 List[Optional[str]] # component name, or None for parent
1310 ]
1311 ] = None
1312 if not isinstance(datasetType, DatasetType):
1313 # We were given a dataset type expression (which may be as simple
1314 # as a str). Loop over all matching datasets, delegating handling
1315 # of the `components` argument to queryDatasetTypes, as we populate
1316 # the composition dict.
1317 composition = defaultdict(list)
1318 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
1319 parentName, componentName = trueDatasetType.nameAndComponent()
1320 if componentName is not None:
1321 parentDatasetType = self.getDatasetType(parentName)
1322 composition.setdefault(parentDatasetType, []).append(componentName)
1323 else:
1324 composition.setdefault(trueDatasetType, []).append(None)
1325 elif datasetType.isComponent():
1326 # We were given a true DatasetType instance, but it's a component.
1327 # the composition dict will have exactly one item.
1328 parentName, componentName = datasetType.nameAndComponent()
1329 parentDatasetType = self.getDatasetType(parentName)
1330 composition = {parentDatasetType: [componentName]}
1331 if composition is not None:
1332 # We need to recurse. Do that once for each parent dataset type.
1333 chain = []
1334 for parentDatasetType, componentNames in composition.items():
1335 parentResults = self.queryDatasets(
1336 parentDatasetType,
1337 collections=collections,
1338 dimensions=dimensions,
1339 dataId=standardizedDataId,
1340 where=where,
1341 findFirst=findFirst
1342 )
1343 if isinstance(parentResults, queries.ParentDatasetQueryResults):
1344 chain.append(
1345 parentResults.withComponents(componentNames)
1346 )
1347 else:
1348 # Should only happen if we know there would be no results.
1349 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
1350 and not parentResults._chain
1351 return queries.ChainedDatasetQueryResults(chain)
1352 # If we get here, there's no need to recurse (or we are already
1353 # recursing; there can only ever be one level of recursion).
1355 # The full set of dimensions in the query is the combination of those
1356 # needed for the DatasetType and those explicitly requested, if any.
1357 requestedDimensionNames = set(datasetType.dimensions.names)
1358 if dimensions is not None:
1359 requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
1360 # Construct the summary structure needed to construct a QueryBuilder.
1361 summary = queries.QuerySummary(
1362 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
1363 dataId=standardizedDataId,
1364 expression=where,
1365 )
1366 builder = self.makeQueryBuilder(summary)
1367 # Add the dataset subquery to the query, telling the QueryBuilder to
1368 # include the rank of the selected collection in the results only if we
1369 # need to findFirst. Note that if any of the collections are
1370 # actually wildcard expressions, and we've asked for deduplication,
1371 # this will raise TypeError for us.
1372 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
1373 return queries.ChainedDatasetQueryResults(())
1374 query = builder.finish()
1375 return queries.ParentDatasetQueryResults(self._db, query, components=[None])
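    # Usage sketch (editor's addition); the dataset type, collection name,
    # and ``where`` expression are hypothetical.
    #
    #     refs = registry.queryDatasets(
    #         "my_flat",
    #         collections=["u/someone/run"],
    #         where="instrument = 'HypCam' AND detector = 12",
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)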
1377 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
1378 dataId: Optional[DataId] = None,
1379 datasets: Any = None,
1380 collections: Any = None,
1381 where: Optional[str] = None,
1382 components: Optional[bool] = None,
1383 **kwargs: Any) -> queries.DataCoordinateQueryResults:
1384 """Query for data IDs matching user-provided criteria.
1386 Parameters
1387 ----------
1388 dimensions : `Dimension` or `str`, or iterable thereof
1389 The dimensions of the data IDs to yield, as either `Dimension`
1390 instances or `str`. Will be automatically expanded to a complete
1391 `DimensionGraph`.
1392 dataId : `dict` or `DataCoordinate`, optional
1393 A data ID whose key-value pairs are used as equality constraints
1394 in the query.
1395 datasets : `Any`, optional
1396 An expression that fully or partially identifies dataset types
1397 that should constrain the yielded data IDs. For example, including
1398 "raw" here would constrain the yielded ``instrument``,
1399 ``exposure``, ``detector``, and ``physical_filter`` values to only
1400 those for which at least one "raw" dataset exists in
1401 ``collections``. Allowed types include `DatasetType`, `str`,
1402 `re.Pattern`, and iterables thereof. Unlike other dataset type
1403 expressions, ``...`` is not permitted - it doesn't make sense to
1404 constrain data IDs on the existence of *all* datasets.
1405 See :ref:`daf_butler_dataset_type_expressions` for more
1406 information.
1407 collections : `Any`, optional
1408 An expression that fully or partially identifies the collections
1409 to search for datasets, such as a `str`, `re.Pattern`, or iterable
1410 thereof. `...` can be used to return all collections. Must be
1411 provided if ``datasets`` is, and is ignored if it is not. See
1412 :ref:`daf_butler_collection_expressions` for more information.
1413 where : `str`, optional
1414 A string expression similar to a SQL WHERE clause. May involve
1415 any column of a dimension table or (as a shortcut for the primary
1416 key column of a dimension table) dimension name. See
1417 :ref:`daf_butler_dimension_expressions` for more information.
1418 components : `bool`, optional
1419 If `True`, apply all dataset expression patterns to component
1420 dataset type names as well. If `False`, never apply patterns to
1421 components. If `None` (default), apply patterns to components only
1422 if their parent datasets were not matched by the expression.
1423 Fully-specified component datasets (`str` or `DatasetType`
1424 instances) are always included.
1425 **kwargs
1426 Additional keyword arguments are forwarded to
1427 `DataCoordinate.standardize` when processing the ``dataId``
1428 argument (and may be used to provide a constraining data ID even
1429 when the ``dataId`` argument is `None`).
1431 Returns
1432 -------
1433 dataIds : `DataCoordinateQueryResults`
1434 Data IDs matching the given query parameters. These are guaranteed
1435 to identify all dimensions (`DataCoordinate.hasFull` returns
1436 `True`), but will not contain `DimensionRecord` objects
1437 (`DataCoordinate.hasRecords` returns `False`). Call
1438 `DataCoordinateQueryResults.expanded` on the returned object to
1439 fetch those (and consider using
1440 `DataCoordinateQueryResults.materialize` on the returned object
1441 first if the expected number of rows is very large). See
1442 documentation for those methods for additional information.
1443 """
1444 dimensions = iterable(dimensions)
1445 standardizedDataId = self.expandDataId(dataId, **kwargs)
1446 standardizedDatasetTypes = set()
1447 requestedDimensions = self.dimensions.extract(dimensions)
1448 queryDimensionNames = set(requestedDimensions.names)
1449 if datasets is not None:
1450 if collections is None:
1451 raise TypeError("Cannot pass 'datasets' without 'collections'.")
1452 for datasetType in self.queryDatasetTypes(datasets, components=components):
1453 queryDimensionNames.update(datasetType.dimensions.names)
1454 # If any matched dataset type is a component, just operate on
1455 # its parent instead, because Registry doesn't know anything
1456 # about what components exist, and here (unlike queryDatasets)
1457 # we don't care about returning them.
1458 parentDatasetTypeName, componentName = datasetType.nameAndComponent()
1459 if componentName is not None:
1460 datasetType = self.getDatasetType(parentDatasetTypeName)
1461 standardizedDatasetTypes.add(datasetType)
1462 # Preprocess collections expression in case the original included
1463 # single-pass iterators (we'll want to use it multiple times
1464 # below).
1465 collections = CollectionQuery.fromExpression(collections)
1467 summary = queries.QuerySummary(
1468 requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
1469 dataId=standardizedDataId,
1470 expression=where,
1471 )
1472 builder = self.makeQueryBuilder(summary)
1473 for datasetType in standardizedDatasetTypes:
1474 builder.joinDataset(datasetType, collections, isResult=False)
1475 query = builder.finish()
1476 return queries.DataCoordinateQueryResults(self._db, query)
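    # Usage sketch (editor's addition); the dimension names, dataset type,
    # and collection name are hypothetical.
    #
    #     dataIds = registry.queryDataIds(
    #         ["instrument", "detector"],
    #         datasets="my_flat",
    #         collections=["u/someone/run"],
    #     )
    #     for dataId in dataIds.expanded():
    #         print(dataId)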
1478 def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
1479 dataId: Optional[DataId] = None,
1480 datasets: Any = None,
1481 collections: Any = None,
1482 where: Optional[str] = None,
1483 components: Optional[bool] = None,
1484 **kwargs: Any) -> Iterator[DimensionRecord]:
1485 """Query for dimension information matching user-provided criteria.
1487 Parameters
1488 ----------
1489 element : `DimensionElement` or `str`
1490 The dimension element, or the name thereof, to obtain records for.
1491 dataId : `dict` or `DataCoordinate`, optional
1492 A data ID whose key-value pairs are used as equality constraints
1493 in the query.
1494 datasets : `Any`, optional
1495 An expression that fully or partially identifies dataset types
1496 that should constrain the yielded records. See `queryDataIds` and
1497 :ref:`daf_butler_dataset_type_expressions` for more information.
1498 collections : `Any`, optional
1499 An expression that fully or partially identifies the collections
1500 to search for datasets. See `queryDataIds` and
1501 :ref:`daf_butler_collection_expressions` for more information.
1502 where : `str`, optional
1503 A string expression similar to a SQL WHERE clause. See
1504 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
1505 information.
1506 components : `bool`, optional
1507 Whether to apply dataset expressions to components as well.
1508 See `queryDataIds` for more information.
1509 **kwargs
1510 Additional keyword arguments are forwarded to
1511 `DataCoordinate.standardize` when processing the ``dataId``
1512 argument (and may be used to provide a constraining data ID even
1513 when the ``dataId`` argument is `None`).
1515 Returns
1516 -------
1517 records : `Iterator` [ `DimensionRecord` ]
1518 Dimension records matching the given query parameters.
1519 """
1520 if not isinstance(element, DimensionElement):
1521 element = self.dimensions[element]
1522 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
1523 where=where, components=components, **kwargs)
1524 return iter(self._dimensions[element].fetch(dataIds))
1526 def queryDatasetAssociations(
1527 self,
1528 datasetType: Union[str, DatasetType],
1529 collections: Any = ...,
1530 *,
1531 collectionTypes: Iterable[CollectionType] = CollectionType.all(),
1532 flattenChains: bool = False,
1533 ) -> Iterator[DatasetAssociation]:
1534 """Iterate over dataset-collection combinations where the dataset is in
1535 the collection.
1537 This method is a temporary placeholder for better support for
1538 association results in `queryDatasets`. It will probably be
1539 removed in the future, and should be avoided in production code
1540 whenever possible.
1542 Parameters
1543 ----------
1544 datasetType : `DatasetType` or `str`
1545 A dataset type object or the name of one.
1546 collections : `Any`, optional
1547 An expression that fully or partially identifies the collections
1548 to search for datasets. See `queryCollections` and
1549 :ref:`daf_butler_collection_expressions` for more information.
1550 collectionTypes : `AbstractSet` [ `CollectionType` ], optional
1551 If provided, only yield associations from collections of these
1552 types.
1553 flattenChains : `bool`, optional
1554 If `True` (`False` is default), search in the children of
1555 `~CollectionType.CHAINED` collections. If `False`, ``CHAINED``
1556 collections are ignored.
1558 Yields
1559 ------
1560 association : `DatasetAssociation`
1561 Object representing the relationship between a single dataset and
1562 a single collection.
1563 """
1564 collections = CollectionQuery.fromExpression(collections)
1565 tsRepr = self._db.getTimespanRepresentation()
1566 if isinstance(datasetType, str):
1567 storage = self._datasets[datasetType]
1568 else:
1569 storage = self._datasets[datasetType.name]
1570 for collectionRecord in collections.iter(self._collections, datasetType=datasetType,
1571 collectionTypes=frozenset(collectionTypes),
1572 flattenChains=flattenChains):
1573 query = storage.select(collectionRecord)
1574 if query is None:
1575 continue
1576 for row in self._db.query(query.combine()):
1577 dataId = DataCoordinate.fromRequiredValues(
1578 storage.datasetType.dimensions,
1579 tuple(row[name] for name in storage.datasetType.dimensions.required.names)
1580 )
1581 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
1582 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
1583 conform=False)
1584 if collectionRecord.type is CollectionType.CALIBRATION:
1585 timespan = tsRepr.extract(row)
1586 else:
1587 timespan = None
1588 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)
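    # Usage sketch (editor's addition); the dataset type and collection-type
    # restriction are hypothetical.
    #
    #     for assoc in registry.queryDatasetAssociations(
    #             "my_flat", collections=...,
    #             collectionTypes={CollectionType.CALIBRATION}):
    #         print(assoc.ref, assoc.collection, assoc.timespan)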
1590 storageClasses: StorageClassFactory
1591 """All storage classes known to the registry (`StorageClassFactory`).
1592 """